Multiple boxplots on the same graph - python

I need to create multiple boxplots on the same graph. The sports are 3. I need to obtain 3 boxplots on the same graph of each sport, with a specific variable on the y-axis. I need to be able to change the variable. The variable for each student is registered various times and is given by the mean of the 3 largest numbers. I have 30 students identified with an ID (that goes from 1 to 30). Each student does only one sport. This is what I wrote but clearly it doesn't work. Can someone help? I hope my explanation made sense.
def boxplot(sport, variable):
    """Draw one boxplot per sport of each student's top-3 mean of `variable`.

    Reads the module-level DataFrame ``df`` (columns ``ID``, ``sport`` and
    the column named by `variable`) and draws on a new matplotlib figure.

    Parameters
    ----------
    sport : str or iterable of str
        A single sport name, or several sport names to be drawn side by
        side on the same axes.
    variable : str
        Name of the column to summarise; pass a different name to plot a
        different variable.
    """
    sports = [sport] if isinstance(sport, str) else list(sport)
    per_sport = {}
    for name in sports:
        rows = df[df.sport == name]
        # One value per student: the mean of that student's 3 largest readings.
        # Grouping by ID only visits students who actually do this sport, so
        # no NaN placeholders are produced for the other students (a NaN in
        # the data makes matplotlib draw an empty box).
        scores = rows.groupby('ID')[variable].apply(
            lambda readings: readings.nlargest(n=3).mean()
        )
        per_sport[f'Boxplot for {variable} in {name}'] = scores.dropna().to_numpy()
    fig, ax = plt.subplots()
    ax.boxplot(list(per_sport.values()))
    ax.set_xticklabels(list(per_sport.keys()))
    ax.set_ylabel(variable)
    plt.show()

Here's one way to do it.
import plotly.express as px
# Load plotly's bundled `tips` sample dataset.
df = px.data.tips()
# One group of boxes per day on the x-axis, split into coloured boxes by smoker status.
fig = px.box(df, x="day", y="total_bill", color="smoker")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()
If your data is not already in long (melted/stacked) format, you can reshape it with pandas.melt:
https://pandas.pydata.org/docs/reference/api/pandas.melt.html
Finally, for Matplotlib, you can do it like this.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Results of the long jump finals at two Olympic Games
# Each column holds the distances of one final and becomes one box in the plot.
data = pd.DataFrame({
'London 2012 (Men)': [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93],
'Rio 2016 (Men)': [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05],
'London 2012 (Women)': [7.12, 7.07, 6.89, 6.88, 6.77, 6.76, 6.72, 6.67],
'Rio 2016 (Women)': [7.17, 7.15, 7.08, 6.95, 6.81, 6.79, 6.74, 6.69]
})
# Plot
bp = plt.boxplot(
# A data frame needs to be converted to an array before it can be plotted this way
np.array(data),
# You can use the column headings from the data frame as labels
labels=list(data)
)
# Axis details
plt.title('Long Jump Finals')
plt.ylabel('Distance [m]')
plt.xlabel('Olympics')
plt.minorticks_on()
# Minor ticks are switched on above; hide them again on the x-axis only.
plt.tick_params(axis='x', which='minor', bottom=False)
plt.tick_params(axis='x', which='major', labelsize='small')
plt.show()
Here is one final update. Make sure the y-axis is numeric...
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plot
# Load plotly's bundled `tips` sample dataset.
df = px.data.tips()
# NOTE(review): presumably px.data.tips() already returns a DataFrame, making this a no-op — confirm.
df=pd.DataFrame(df)
print(type(df))
df.head()
df.columns = ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
# One box per listed column, all on the same axes; the columns must be numeric.
b_plot = df.boxplot(column = ['tip','size','total_bill'])
# NOTE(review): called with no data, so this looks redundant — confirm before removing.
b_plot.plot()
plot.show()

Related

How to change the order of subplots matplotlib

I have a data frame that contains average concentrations for 4 different sites based on season and year. The code I wrote produces a figure for each site, with four subplots for each season. Year is on the y-axis and concentration is on the x-axis.
Here's the link to my data: https://drive.google.com/file/d/1mVAsjRiFmMXaW0F8HBhadi1ZQPcUGIa7/view?usp=sharing
The issue is that the code automatically plots the subplots as
fall - spring
summer - winter
I want them to plot in chronological order, because that makes more sense than alphabetical order:
spring - summer
fall - winter
Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import scipy.stats
# Question code: one figure per station, with a 2x2 grid of per-season subplots.
main_dataframe = pd.read_csv('NOx_sznl.csv')
main_dataframe.rename(columns={'NOx_3168':'Banning NOx', 'NOx_2199':'Palm Springs NOx', 'NOx_2551':'El Centro NOx', 'NOx_3135':'Calexico NOx'}, inplace=True)
# Everything except the Year and Season key columns is treated as a station column.
col = list(main_dataframe.columns)
col.remove('Year')
col.remove('Season')
for ind,station in enumerate(col):
df_new = main_dataframe[['Season', 'Year', col[ind]]]
###here I tried to reorder the seasons in the dataframe
df_new = df_new.set_index('Season')
df_new = df_new.loc[['Spring', 'Summer', 'Fall', 'Winter'], :]
df_new = df_new.reset_index()
###but it didn't change the outcome
df_new = df_new.set_index('Year')
# df_new['Betty Jo Mcneece Receiving Home'].astype('float')
df_new[col[ind]] = df_new[col[ind]]
# NOTE: groupby sorts the group keys by default, so the season order set above
# is replaced by alphabetical order here (Fall, Spring, Summer, Winter).
grouped = df_new.groupby('Season')
rowlength = grouped.ngroups/2 # fix up if odd number of groups
fig, axs = plt.subplots(figsize=(15,10),
nrows=2, ncols=int(rowlength), # fix as above
gridspec_kw=dict(hspace=0.4))#, sharex='col', sharey='row') # Much control of gridspec
# Pair each group key with one subplot axis, in the groupby's key order.
targets = zip(grouped.groups.keys(), axs.flatten())
for i, (key, ax) in enumerate(targets):
ax.plot(grouped.get_group(key)[col[ind]], marker='o', color='orange')
ax.set_ylim(0,)
ax.set_yticks(ax.get_yticks(),size=12)
#ax.set_xlim(2009,2020)
ax.set_xticks(np.arange(2009,2020,1))
ax.set_xticklabels(ax.get_xticks(), rotation = 45, size=12)
fig.suptitle("%s"%col[ind], fontsize=30)
# ax.set_title('%s')
# Subplot titles are hard-coded in alphabetical order to match the sorted group keys.
plt.subplot(221)
plt.gca().set_title('Fall', fontsize=20)
plt.subplot(222)
plt.gca().set_title('Spring', fontsize=20)
plt.subplot(223)
plt.gca().set_title('Summer', fontsize=20)
plt.subplot(224)
plt.gca().set_title('Winter', fontsize=20)
plt.show()
I would appreciate any help rearranging the subplots.
The order of the subplots is given by grouped.groups.keys() in targets = zip(grouped.groups.keys(), axs.flatten()) but the problem is further upstream in grouped = df_new.groupby('Season') which is where grouped.groups.keys() comes from. df.groupby() automatically sorts alphabetically unless you do sort=False, so grouped = df_new.groupby('Season', sort=False) should follow the order you provided when you made df_new.
Here is what your code looks like on my end so you can have an exact copy.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import scipy.stats
# One figure per monitoring station, with one subplot per season laid out in
# chronological order (Spring, Summer, Fall, Winter).
main_dataframe = pd.read_csv('NOx_sznl.csv')
main_dataframe.rename(columns={'NOx_3168': 'Banning NOx',
                               'NOx_2199': 'Palm Springs NOx',
                               'NOx_2551': 'El Centro NOx',
                               'NOx_3135': 'Calexico NOx'},
                      inplace=True)
# Everything except the Year and Season key columns is a station column.
col = list(main_dataframe.columns)
col.remove('Year')
col.remove('Season')
for station in col:
    df_new = main_dataframe[['Season', 'Year', station]]
    # Put the rows in the desired season order; groupby(sort=False) below
    # keeps groups in order of first appearance, so this ordering survives.
    df_new = df_new.set_index('Season')
    df_new = df_new.loc[['Spring', 'Summer', 'Fall', 'Winter'], :]
    df_new = df_new.reset_index()
    df_new = df_new.set_index('Year')
    grouped = df_new.groupby('Season', sort=False)
    # Two rows of subplots; round the column count up so an odd number of
    # groups still gets enough axes.
    ncols = (grouped.ngroups + 1) // 2
    fig, axs = plt.subplots(figsize=(15, 10),
                            nrows=2, ncols=ncols,
                            gridspec_kw=dict(hspace=0.4))
    for (key, season_data), ax in zip(grouped, axs.flatten()):
        ax.plot(season_data[station], marker='o', color='orange')
        ax.set_ylim(bottom=0)
        # Tick-label size goes through tick_params: set_yticks() only accepts
        # text keyword arguments when explicit labels are passed as well.
        ax.tick_params(axis='y', labelsize=12)
        ax.set_xticks(np.arange(2009, 2020, 1))
        ax.set_xticklabels(ax.get_xticks(), rotation=45, size=12)
        # Title each subplot from its group key so titles can never get out
        # of step with the data.
        ax.set_title(key)
    fig.suptitle("%s" % station, fontsize=30)
    plt.show()

Series not showing up on plots

I've been trying to work through the code in this function and cannot get my series to show up on my plots. Possibly there is an easier way to do this. In each plot I want to display each of the 7 entities as a time series for 1 indicator.
I'm struggling with how to group values by both year, and country. I am new to python and data science so I appreciate any help.
Here is a link to the csv data from the World Bank
https://datacatalog.worldbank.org/search/dataset/0037712
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Global plot styling and default figure size.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 7)
raw = pd.read_csv('WDIData.csv')
# Country codes to keep (six countries plus the EUU aggregate).
countries = ['BIH', 'HRV', 'MKD', 'MNE', 'SRB', 'SVN', 'EUU']
# Line colour per country name, looked up by plot_indicator.
colors = {
'Bosnia and Herzegovina': "#66C2A5",
'Croatia': "#FA8D62",
'North Macedonia': "#F7BA20",
'Montenegro': "#E68AC3",
'Serbia': "#8D9FCA",
'Slovenia': "#A6D853",
'avg. EU': "#CCCCCC"
}
i = 0
df = raw[raw['Country Code'].isin(countries)].copy()
# Drop the year columns 1960-1989; the column labels are strings.
pre_1990 = [str(x) for x in range(1960, 1990)]
df.drop(pre_1990, axis=1, inplace=True)
df = df.rename(columns={'Country Name': 'CountryName', 'Country Code': 'CountryCode', 'Indicator Name': 'IndicatorName', 'Indicator Code': 'IndicatorCode'})
columns = ['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode']
# Reshape from one column per year to long form: one row per country, indicator and year.
# NOTE: 'Year' holds the former column labels, so its values are strings, not integers.
df = pd.melt(df, id_vars=columns, var_name='Year', value_name='Value')
df.dropna(inplace=True)
def plot_indicator(indicators, title=None,
                   xlim=None, ylim=None, xspace=None,
                   loc=0, loc2=0,
                   drop_eu=False, filename=None):
    """Plot one or two indicators as per-country time series on one figure.

    Reads the module-level long-format DataFrame ``df`` (columns
    ``CountryName``, ``IndicatorCode``, ``IndicatorName``, ``Year``,
    ``Value``) and the module-level ``colors`` mapping.

    Parameters
    ----------
    indicators : tuple or list of tuple
        One ``(label, indicator_code)`` pair, or a list of such pairs. At
        most two are drawn: the first with solid lines, the second dashed.
    title : str, optional
        Axes title; defaults to the indicator name found in the data.
    xlim, ylim : tuple, optional
        Axis limits passed to ``set_xlim`` / ``set_ylim``. `xlim` is given
        in years, e.g. ``(1990, 2020)``.
    xspace : int, optional
        Spacing of x ticks; only used together with `xlim`.
    loc, loc2 : int or str
        Legend locations for the country legend and the line-style legend.
    drop_eu : bool
        If true, the 'European Union' series is not drawn.
    filename : str, optional
        Accepted for interface compatibility; not used by this function.
    """
    lines = ['-', '--']
    line_styles = []
    fig, ax = plt.subplots()
    indicators = indicators if isinstance(indicators, list) else [indicators]
    for line, (name, indicator) in zip(lines, indicators):
        # Invisible placeholder line, used only to build the line-style legend.
        ls, = plt.plot(np.nan, linestyle=line, color='#999999')
        line_styles.append([ls, name])
        df_ind = df[(df.IndicatorCode == indicator)]
        # Group by a scalar key (not a one-element list) so each group key is
        # the plain country name rather than a 1-tuple.
        group = df_ind.groupby('CountryName')
        for country, values in group:
            # Average only the numeric Value column; the frame also holds
            # string columns that cannot be averaged.
            country_values = values.groupby('Year')['Value'].mean()
            # Years come from column labels and may be strings. Make them
            # integers so the x-axis is numeric and `xlim` given in years
            # selects the intended range instead of positional indices.
            country_values.index = country_values.index.astype(int)
            country_values = country_values.sort_index()
            if country == 'European Union':
                if drop_eu:
                    continue
                ax.plot(country_values, label=country,
                        linestyle='--', color='#666666', linewidth=1, zorder=1)
            elif country_values.shape[0] > 1:
                ax.plot(country_values, label=country, linestyle=line,
                        color=colors[country], linewidth=2.5)
        if line == lines[0]:
            # Country legend is built from the first indicator's lines only.
            legend = plt.legend(loc=loc)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    if xlim and xspace:
        ax.set_xticks(np.arange(xlim[0], xlim[1]+1, xspace))
    plt.tight_layout()
    fig.subplots_adjust(top=0.94)
    if title:
        ax.set_title(title)
    else:
        ax.set_title(df_ind.IndicatorName.values[0])
    if len(indicators) > 1:
        # A second legend replaces the first, so re-attach the country legend.
        plt.legend(*zip(*line_styles), loc=loc2)
        ax.add_artist(legend)
# (label, indicator code) pairs; each pair is plotted on its own figure below.
population = [
('pop_dens', 'EN.POP.DNST'), # Population density
('rural', 'SP.RUR.TOTL.ZS'), # Rural population
('under14', 'SP.POP.0014.TO.ZS'),# Population, ages 0-14
('above65', 'SP.POP.65UP.TO.ZS'),# Population ages 65 and above
]
for indicator in population:
plot_indicator(indicator, loc=0, xlim=(1990, 2020))
EDIT
I have re-written this answer to be more clear and concise.
This is a clever bit of code! I found the problem: it was with xlim. As the years are strings, not integers, the x-axis is index-based, not integer-based. This means that when you pass the range 1990 to 2020, you are looking at the 1990th to 2020th values! Obviously, there are not that many values (there are only about 30 years between 1990 and 2020), so there was no data within that range, hence the blank plot.
If you change the code within the function to ax.set_xlim(xlim[0]-int(df_ind['Year'].min()), xlim[1]-int(df_ind['Year'].min())) then you can pass the year and it will subtract the minimum year to give the appropriate index values. I would also add plt.xticks(rotation=45) underneath to stop the ticks overlapping.
ALTERNATIVELY!! (this is the option I would choose):
You can simply change the DataFrame column type to integer, then everything you have remains unchanged. Underneath df.dropna(inplace=True) (just before the function), you can add df['Year'] = df['Year'].astype(int), which solves the problem with the non-integer x-axis above.
Once one or the other has been changed, you should be able to see the lines of the plots.

How to create a min-max lineplot by month

I have time series data of retail beef ad counts, and I intend to make a stacked line chart that shows, on a three-week average basis, the average number of ads that grocers posted per store last week. To do so, I aggregated the data for plotting and tried to make the line chart that I want. The main motivation comes from the context of the problem and the desired plot. In my attempt, I couldn't get a good line chart, because the result is not informative enough to understand. I am wondering how I can achieve this goal in matplotlib. Can anyone suggest what I should change in my current attempt? Any thoughts?
reproducible data and current attempt
Here is minimal reproducible data that I used in my current attempt:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import timedelta, datetime
# Question code: per retail item, plot a rolling mean of its share of ads.
url = 'https://gist.githubusercontent.com/adamFlyn/96e68902d8f71ad62a4d3cda135507ad/raw/4761264cbd55c81cf003a4219fea6a24740d7ce9/df.csv'
df = pd.read_csv(url, parse_dates=['date'])
df.drop(columns=['Unnamed: 0'], inplace=True)
# Total ads per (date, retail_item), then each item's percentage of that date's total.
df_grp = df.groupby(['date', 'retail_item']).agg({'number_of_ads': 'sum'})
df_grp["percentage"] = df_grp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
df_grp = df_grp.reset_index(level=[0,1])
for item in df_grp['retail_item'].unique():
dd = df_grp[df_grp['retail_item'] == item].groupby(['date', 'percentage'])[['number_of_ads']].sum().reset_index(level=[0,1])
# Rolling mean of the percentage over a window of 7 rows.
dd['weakly_change'] = dd[['percentage']].rolling(7).mean()
fig, ax = plt.subplots(figsize=(8, 6), dpi=144)
# NOTE(review): after reset_index the index is a plain row counter, so the x-axis
# here is row position rather than the date, yet a date formatter is applied below — confirm intent.
sns.lineplot(dd.index, 'weakly_change', data=dd, ax=ax)
ax.set_xlim(dd.index.min(), dd.index.max())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
plt.gcf().autofmt_xdate()
plt.style.use('ggplot')
plt.xticks(rotation=90)
plt.show()
Current Result
However, I couldn't get the line chart that I expected. I want to reproduce the plot from this site. Is that achievable? Any ideas?
desired plot
here is the example desired plot that I want to make from this minimal reproducible data:
I don't know how I should change my current attempt to get the desired plot above. Does anyone know a possible way of doing this in matplotlib? What else should I do? Any help would be appreciated. Thanks.
Also see How to create a min-max plot by month with fill_between?
See in-line comments for details
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
#################################################################
# setup from question
url = 'https://gist.githubusercontent.com/adamFlyn/96e68902d8f71ad62a4d3cda135507ad/raw/4761264cbd55c81cf003a4219fea6a24740d7ce9/df.csv'
df = pd.read_csv(url, parse_dates=['date'])
df.drop(columns=['Unnamed: 0'], inplace=True)
# total ads per (date, retail_item), then each item's percentage of that date's total
df_grp = df.groupby(['date', 'retail_item']).agg({'number_of_ads': 'sum'})
df_grp["percentage"] = df_grp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
df_grp = df_grp.reset_index(level=[0,1])
#################################################################
# create a month map from long to abbreviated calendar names
month_map = dict(zip(calendar.month_name[1:], calendar.month_abbr[1:]))
# update the month column name
df_grp['month'] = df_grp.date.dt.month_name().map(month_map)
# set month as categorical so they are plotted in the correct order
df_grp.month = pd.Categorical(df_grp.month, categories=month_map.values(), ordered=True)
# use groupby to aggregate min mean and max
# the result is long form: one row per (retail_item, month, statistic), with the
# statistic name in column 'mm' and its value in column 'vals'
dfmm = df_grp.groupby(['retail_item', 'month'])['percentage'].agg([max, min, 'mean']).stack().reset_index(level=[2]).rename(columns={'level_2': 'mm', 0: 'vals'}).reset_index()
# create a palette map for line colors
cmap = {'min': 'k', 'max': 'k', 'mean': 'b'}
# iterate through each retail item and plot the corresponding data
for g, d in dfmm.groupby('retail_item'):
plt.figure(figsize=(7, 4))
sns.lineplot(x='month', y='vals', hue='mm', data=d, palette=cmap)
# select only min or max data for fill_between
y1 = d[d.mm == 'max']
y2 = d[d.mm == 'min']
# shade the band between the monthly maximum and minimum
plt.fill_between(x=y1.month, y1=y1.vals, y2=y2.vals, color='gainsboro')
# add lines for specific years
for year in [2016, 2018, 2020]:
data = df_grp[(df_grp.date.dt.year == year) & (df_grp.retail_item == g)]
# NOTE(review): newer seaborn releases replace `ci` with `errorbar` — confirm against the installed version
sns.lineplot(x='month', y='percentage', ci=None, data=data, label=year)
plt.ylim(0, 100)
plt.margins(0, 0)
# place the legend outside the axes, to the right
plt.legend(bbox_to_anchor=(1., 1), loc='upper left')
plt.ylabel('Percentage of Ads')
plt.title(g)
plt.show()

How to plot two bar graphs side by side with columns from different dataframes in pandas

I want to plot two bar graphs side by side using matplotlib/seaborn to compare Covid-19 confirmed cases for two countries: Italy and India. However, after trying many methods I couldn't solve the problem. The confirmed cases of the two countries come from two different data frames.
Data source
I want to plot 'Dates' column on x-axis and 'Confirmed cases count' on y-axis.
Attaching images of my code for reference.
P.S: I am new to data visualization and pandas too.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# The URL must be a single unbroken string literal; a line break inside the
# quotes is a syntax error.
df = pd.read_csv(
    'https://raw.githubusercontent.com/datasets/covid-19/master/data/countries-aggregated.csv',
    parse_dates=['Date'])
df.head(5)
# Confirmed-case rows for India.
ind_cnfd = df[['Date', 'Country', 'Confirmed']]
ind_cnfd = ind_cnfd[ind_cnfd['Country'] == 'India']
# Confirmed-case rows for Italy.
italy_cnfd = df[['Date', 'Country', 'Confirmed']]
italy_cnfd = italy_cnfd[italy_cnfd['Country'] == 'Italy']
Expected output kind of this:
With dates on x-axis and confirmed cases on y-axis
Here's an example of what you can put together using matplotlib through the pandas plotting API. Feel free to play around with the axes settings, spacing, and so on by looking through the matplotlib documentation. Note that I did not use seaborn: the only plotting import you need to run this code from your notebook is import matplotlib.pyplot as plt.
You can optionally display the confirmed cases on a log-based y scale with the line: plt.yscale('log')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv('https://raw.githubusercontent.com/datasets/covid-19/master/data/countries-aggregated.csv',
parse_dates = ['Date'])
# select the Date, Country, Confirmed features from country, with reset of index
ind_cnfd = df[df.Country == 'India']
ind_cnfd = ind_cnfd[['Date', 'Confirmed']].reset_index(drop = True)
ind_cnfd = ind_cnfd.rename(columns = {'Confirmed': 'Confirmed Cases in India'})
italy_cnfd = df[df.Country == 'Italy']
italy_cnfd = italy_cnfd[['Date', 'Confirmed']].reset_index(drop = True)
italy_cnfd = italy_cnfd.rename(columns = {'Confirmed': 'Confirmed Cases in Italy'})
# combine dataframes together, turn the date column into the index
# rows are aligned on the reset 0..n-1 index; assumes both countries cover the same dates in the same order — TODO confirm
df_cnfd = pd.concat([ind_cnfd.drop(columns = 'Date'), italy_cnfd], axis = 1)
df_cnfd['Date'] = df_cnfd['Date'].dt.date
df_cnfd.set_index('Date', inplace=True)
# make a grouped bar plot time series
ax = df_cnfd.plot.bar()
# hide every other tick label (starting with the first) so the rest stay readable
for label in ax.xaxis.get_ticklabels()[::2]:
label.set_visible(False)
# add titles, axis labels
plt.suptitle("Confirmed COVID-19 Cases over Time", fontsize = 15)
plt.xlabel("Dates")
plt.ylabel("Number of Confirmed Cases")
plt.tight_layout()
# plt.yscale('log')
plt.show()

Color by Column Values in Matplotlib

One of my favorite aspects of using the ggplot2 library in R is the ability to easily specify aesthetics. I can quickly make a scatterplot and apply color associated with a specific column, and I would love to be able to do this with python/pandas/matplotlib. I'm wondering if there are any convenience functions that people use to map colors to values using pandas dataframes and Matplotlib?
##ggplot scatterplot example with R dataframe, `df`, colored by col3
ggplot(data = df, aes(x=col1, y=col2, color=col3)) + geom_point()
##ideal situation with pandas dataframe, 'df', where colors are chosen by col3
df.plot(x=col1,y=col2,color=col3)
EDIT:
Thank you for your responses but I want to include a sample dataframe to clarify what I am asking. Two columns contain numerical data and the third is a categorical variable. The script I am thinking of will assign colors based on this value.
# Sample frame: two numeric columns plus a categorical 'Gender' column (5 Male rows, then 5 Female rows).
np.random.seed(250)
df = pd.DataFrame({'Height': np.append(np.random.normal(6, 0.25, size=5), np.random.normal(5.4, 0.25, size=5)),
'Weight': np.append(np.random.normal(180, 20, size=5), np.random.normal(140, 20, size=5)),
'Gender': ["Male","Male","Male","Male","Male",
"Female","Female","Female","Female","Female"]})
Height Weight Gender
0 5.824970 159.210508 Male
1 5.780403 180.294943 Male
2 6.318295 199.142201 Male
3 5.617211 157.813278 Male
4 6.340892 191.849944 Male
5 5.625131 139.588467 Female
6 4.950479 146.711220 Female
7 5.617245 121.571890 Female
8 5.556821 141.536028 Female
9 5.714171 134.396203 Female
Imports and Data
import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
# seaborn is imported under the alias `sns`, so the bare name `seaborn` is
# not defined; use the alias.
sns.set(style='ticks')
# Fixed seed so the random sample data is reproducible.
numpy.random.seed(0)
N = 37
_genders = ['Female', 'Male', 'Non-binary', 'No Response']
# Two numeric columns plus one categorical column to colour by.
df = pandas.DataFrame({
    'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
    'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
    'Gender': numpy.random.choice(_genders, size=N)
})
Update August 2021
With seaborn 0.11.0, it's recommended to use new figure level functions like seaborn.relplot than to use FacetGrid directly.
sns.relplot(data=df, x='Weight (kg)', y='Height (cm)', hue='Gender', hue_order=_genders, aspect=1.61)
plt.show()
Update October 2015
Seaborn handles this use-case splendidly:
Map matplotlib.pyplot.scatter onto a seaborn.FacetGrid
fg = sns.FacetGrid(data=df, hue='Gender', hue_order=_genders, aspect=1.61)
fg.map(plt.scatter, 'Weight (kg)', 'Height (cm)').add_legend()
Which immediately outputs:
Old Answer
In this case, I would use matplotlib directly.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def dfScatter(df, xcol='Height', ycol='Weight', catcol='Gender'):
    """Scatter-plot two columns of `df`, coloured by a categorical column.

    Each distinct value of `catcol` is assigned an evenly spaced number in
    [0, 1], which matplotlib maps through its default colormap.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    xcol, ycol : str
        Names of the columns plotted on the x and y axes.
    catcol : str
        Name of the categorical column that determines the colour.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure.
    """
    fig, ax = plt.subplots()
    categories = np.unique(df[catcol])
    colors = np.linspace(0, 1, len(categories))
    colordict = dict(zip(categories, colors))
    # Keep the per-row colour values in a separate Series rather than writing
    # a "Color" column into the caller's DataFrame.
    point_colors = df[catcol].map(colordict)
    ax.scatter(df[xcol], df[ycol], c=point_colors)
    return fig
# Demo driver: `if 1:` is always true, so this runs whenever the snippet is executed.
if 1:
# Random sample data with four distinct 'Gender' categories.
df = pd.DataFrame({'Height':np.random.normal(size=10),
'Weight':np.random.normal(size=10),
'Gender': ["Male","Male","Unknown","Male","Male",
"Female","Did not respond","Unknown","Female","Female"]})
fig = dfScatter(df)
fig.savefig('fig1.png')
And that gives me:
As far as I know, that color column can be any matplotlib-compatible color (RGBA tuples, HTML names, hex values, etc).
I'm having trouble getting anything but numerical values to work with the colormaps.
Actually you could use ggplot for python:
from ggplot import *
import numpy as np
import pandas as pd
# Random sample data: two numeric columns and a two-level 'Gender' column.
df = pd.DataFrame({'Height':np.random.randn(10),
'Weight':np.random.randn(10),
'Gender': ["Male","Male","Male","Male","Male",
"Female","Female","Female","Female","Female"]})
# ggplot, aes and geom_point all come from the `from ggplot import *` line.
ggplot(aes(x='Height', y='Weight', color='Gender'), data=df) + geom_point()
https://seaborn.pydata.org/generated/seaborn.scatterplot.html
import numpy
import pandas
import seaborn as sns
# Fixed seed so the random sample data is reproducible.
numpy.random.seed(0)
N = 37
_genders= ['Female', 'Male', 'Non-binary', 'No Response']
df = pandas.DataFrame({
'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
'Gender': numpy.random.choice(_genders, size=N)
})
# `hue` colours the points by the categorical 'Gender' column.
sns.scatterplot(data=df, x='Height (cm)', y='Weight (kg)', hue='Gender')
You can use the color parameter to the plot method to define the colors you want for each column. For example:
from pandas import DataFrame
# Three numeric columns, each drawn as its own line.
data = DataFrame({'a':range(5),'b':range(1,6),'c':range(2,7)})
# One colour per column, in column order (a, b, c).
colors = ['yellowgreen','cyan','magenta']
data.plot(color=colors)
You can use color names or Color hex codes like '#000000' for black say. You can find all the defined color names in matplotlib's color.py file. Below is the link for the color.py file in matplotlib's github repo.
https://github.com/matplotlib/matplotlib/blob/master/lib/matplotlib/colors.py
The OP is coloring by a categorical column, but this answer is for coloring by a column that is numeric, or can be interpreted as numeric, such as a datetime dtype.
pandas.DataFrame.plot and matplotlib.pyplot.scatter can take a c or color parameter, which must be a color, a sequence of colors, or a sequence of numbers.
Tested in python 3.8, pandas 1.3.1, and matplotlib 3.4.2
Choosing Colormaps in Matplotlib for other valid cmap options.
Imports and Test Data
'Date' is already a datetime64[ns] dtype from DataReader
conda install -c anaconda pandas-datareader or pip install pandas-datareader depending on your environment.
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader as web # for data; not part of pandas
# The name must match the uses below; defining `tickers` while reading
# `ticker` raises NameError.
ticker = 'amzn'
# reset_index() turns the 'Date' index into a regular column for plotting.
df = web.DataReader(ticker, data_source='yahoo', start='2018-01-01', end='2021-01-01').reset_index()
df['ticker'] = ticker
Date High Low Open Close Volume Adj Close ticker
0 2018-01-02 1190.00000 1170.510010 1172.000000 1189.010010 2694500 1189.010010 amzn
1 2018-01-03 1205.48999 1188.300049 1188.300049 1204.199951 3108800 1204.199951 amzn
c as a number
pandas.DataFrame.plot
df.Date.dt.month creates a pandas.Series of month numbers
ax = df.plot(kind='scatter', x='Date', y='High', c=df.Date.dt.month, cmap='Set3', figsize=(11, 4), title='c parameter as a month number')
plt.show()
matplotlib.pyplot.scatter
fig, ax = plt.subplots(figsize=(11, 4))
ax.scatter(data=df, x='Date', y='High', c=df.Date.dt.month, cmap='Set3')
ax.set(title='c parameter as a month number', xlabel='Date', ylabel='High')
plt.show()
c as a datetime dtype
pandas.DataFrame.plot
ax = df.plot(kind='scatter', x='Date', y='High', c='Date', cmap='winter', figsize=(11, 4), title='c parameter as a datetime dtype')
plt.show()
matplotlib.pyplot.scatter
fig, ax = plt.subplots(figsize=(11, 4))
ax.scatter(data=df, x='Date', y='High', c='Date', cmap='winter')
ax.set(title='c parameter as a datetime dtype', xlabel='Date', ylabel='High')
plt.show()
Though not matplotlib, you can achieve this using plotly express:
import numpy as np
import pandas as pd
import plotly.express as px
# Random sample data; 'Size' is a constant so every marker gets the same size.
df = pd.DataFrame({
'Height':np.random.normal(size=10),
'Weight':np.random.normal(size=10),
'Size': 1, # How large each point should be?
'Gender': ["Male","Male","Male","Male","Male","Female","Female","Female","Female","Female"]})
# Create your plot
px.scatter(df, x='Weight', y='Height', size='Size', color='Gender')
If creating in a notebook, you'll get an interactive output like the following:

Categories

Resources