Struggling to insert a pie chart in Python - python

I have made a pie chart using an excel sheet but it is coming out incomplete. I am not sure of the reason. Here is the code:
import matlotplib.pyplot as plt
import pandas as pd
import numpy as np
Employee=pd.read_excel("C:\\Users\\Jon\\Desktop\\data science\\Employee.xlsx")
Employee
colors = ["#1f77b4", "#ff7f0e"]
group_by_departments=Employee.groupby("Department").count().reset_index()
sizes = group_by_departments['Gender']
labels = group_by_departments['Department']
plt.pie(sizes, labels=labels, colors = colors,autopct='%.2f %%')
plt.show()

You can use .size() to get the count for each group. You'll need to group by Department and Gender simultaneously to obtain the individual counts of all the subgroups.
Here is some example code:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
N = 100
Employee = pd.DataFrame({'Gender': np.random.choice(['Male', 'Female'], N),
'Department': np.random.choice(['IT', 'Sales', 'HR', 'Finance'], N),
'Age': np.random.randint(20, 65, N),
'Salary': np.random.randint(20, 100, N) * 1000})
colors = ["turquoise", "tomato"]
group_by_departments_and_gender = Employee.groupby(["Department", "Gender"]).size().reset_index(name='Counts')
sizes = group_by_departments_and_gender['Counts']
labels = [f'{dept}\n {gender}' for dept, gender in group_by_departments_and_gender[['Department', 'Gender']].values]
plt.pie(sizes, labels=labels, colors=colors, autopct='%.2f %%')
plt.tight_layout()
plt.show()
PS: You could assign a color per gender via:
colors = ["magenta" if gender=="Male" else "deepskyblue" for gender in group_by_departments_and_gender["Gender"]]
This especially helps in case one of the genders wouldn't be present in one of the departments.

Related

Showing values on barplot

The following code generates a pdf file that is fed by a looping barplot over a data frame. My goal is to annotate values over the bars. I have already tried various times to monitor the values but failed. May I get any help on this? Thanks
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
d = {'index': ['index1', 'index1', 'index2', 'index2'], 'group': ['gr1', 'gr1','gr2','gr2'],
targetscore':[15,15,10,10], 'exam':['old','new','old','new'], 'score':[5,6,7,8]}
df = pd.DataFrame(data = d)
pp = PdfPages('mypath/extraction.pdf')
for i in range(len(df['group'])):
subdf = df[df['group'] == df.iloc[i,1]]
sns.catplot(y = 'score', x = 'group', data = subdf, hue = 'exam', kind = 'bar',
row = 'index', col = 'exam', col_order = ['old', 'new'], height = 3, aspect = 2)
plt.show
pp.savefig(plt.gcf())
pp.close()

Choosing Another Color Palette for a Mosaic Plot

I have this mosaic:
from statsmodels.graphics.mosaicplot import mosaic
mosaic_data = pd.DataFrame({'gender': labeled_gender, 'y': labeled_y})
mosaic(mosaic_data, ['gender','y'], title='Mosaic of Heart Disease Vs. Gender', ax=ax, properties={})
I simply want to change the color palette to the colors / palette of my choice. Is there a way to do that? Also is there a way to reach & change other properties of the plot, for example color of the labels inside the rectangles?
To change the color you need to provide a mapping that matches the names. Don't think you can change the label color easily:
from statsmodels.graphics.mosaicplot import mosaic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1,figsize=(6,6))
gender = np.repeat(['male','female'],30)
heart_disease = [np.random.choice(['heart disease','no heart disease'],30,p=[0.8,0.2]),
np.random.choice(['heart disease','no heart disease'],30,p=[0.5,0.5])]
data = pd.DataFrame({'gender': gender, 'heart disease': np.array(heart_disease).flatten()})
cols = {('male', 'heart disease'):'#9a1f40',('male', 'no heart disease'):'#d9455f',
('female','heart disease' ):'#74d4c0', ('female', 'no heart disease'):'#def4f0'}
x = mosaic(data,['gender','heart disease'],
properties = lambda key: {'color': cols[key]} ,
ax=ax,gap=0.01)

Plotting percentage of totals with pandas group bys

I am trying to plot a bar chart of a pandas data frame that is the result of two group bys.
In particular, my data frame looks exactly like the output from another SO post's answer (https://stackoverflow.com/a/23377155/7243972):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(0)
df = pd.DataFrame({'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
'office_id': list(range(1, 7)) * 2,
'sales': [np.random.randint(100000, 999999) for _ in range(12)]})
state_office = df.groupby(['state', 'office_id']).agg({'sales': 'sum'})
state = df.groupby(['state']).agg({'sales': 'sum'})
results = state_office.div(state, level='state') * 100
I would like to plot results so that each state is a different color and the office_id is on the x-axis. This is so that each office_id is grouped together and they can be easily compared.
I've tried adjusting the plot from results['sales'].plot.bar(), but I am struggling.
First you need to flatten the dataframe:
data = []
for row in results.iterrows():
state, office_id = row[0]
sales = row[1][0]
data.append((state, office_id, sales))
flat_df = pd.DataFrame(data, columns=['state', 'office_id', 'sales'])
then plot
import seaborn as sns
sns.set(style="whitegrid")
g = sns.factorplot(x="office_id", y="sales", hue="state", data=flat_df, kind="bar", palette="muted")
edit: just realized there is a simpler way to flatten the dataframe:
flat_df = results.reset_index(inplace=False)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(0)
df = pd.DataFrame({'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
'office_id': list(range(1, 7)) * 2,
'sales': [np.random.randint(100000, 999999) for _ in
range(12)]})
state_office = df.groupby(['state', 'office_id']).agg({'sales': 'sum'})
state = df.groupby(['state']).agg({'sales': 'sum'})
results = state_office.div(state, level='state') * 100
results = results.reset_index()
fig, ax = plt.subplots()
for c, df in results.groupby('state'):
ax.scatter(df['office_id'], df['sales'], label=c)
ax.legend()
ax.set_title('Scatterplot')
ax.set_xlabel('office_id')
ax.set_ylabel('sales')
This prints a scatterplot. See if you can take it from here!

Multiple histograms in Pandas

I would like to create the following histogram (see image below) taken from the book "Think Stats". However, I cannot get them on the same plot. Each DataFrame takes its own subplot.
I have the following code:
import nsfg
import matplotlib.pyplot as plt
df = nsfg.ReadFemPreg()
preg = nsfg.ReadFemPreg()
live = preg[preg.outcome == 1]
first = live[live.birthord == 1]
others = live[live.birthord != 1]
#fig = plt.figure()
#ax1 = fig.add_subplot(111)
first.hist(column = 'prglngth', bins = 40, color = 'teal', \
alpha = 0.5)
others.hist(column = 'prglngth', bins = 40, color = 'blue', \
alpha = 0.5)
plt.show()
The above code does not work when I use ax = ax1 as suggested in: pandas multiple plots not working as hists nor this example does what I need: Overlaying multiple histograms using pandas. When I use the code as it is, it creates two windows with histograms. Any ideas how to combine them?
Here's an example of how I'd like the final figure to look:
As far as I can tell, pandas can't handle this situation. That's ok since all of their plotting methods are for convenience only. You'll need to use matplotlib directly. Here's how I do it:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas
#import seaborn
#seaborn.set(style='ticks')
np.random.seed(0)
df = pandas.DataFrame(np.random.normal(size=(37,2)), columns=['A', 'B'])
fig, ax = plt.subplots()
a_heights, a_bins = np.histogram(df['A'])
b_heights, b_bins = np.histogram(df['B'], bins=a_bins)
width = (a_bins[1] - a_bins[0])/3
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue')
ax.bar(b_bins[:-1]+width, b_heights, width=width, facecolor='seagreen')
#seaborn.despine(ax=ax, offset=10)
And that gives me:
In case anyone wants to plot one histogram over another (rather than alternating bars) you can simply call .hist() consecutively on the series you want to plot:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas
np.random.seed(0)
df = pandas.DataFrame(np.random.normal(size=(37,2)), columns=['A', 'B'])
df['A'].hist()
df['B'].hist()
This gives you:
Note that the order you call .hist() matters (the first one will be at the back)
A quick solution is to use melt() from pandas and then plot with seaborn.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# make dataframe
df = pd.DataFrame(np.random.normal(size=(200,2)), columns=['A', 'B'])
# plot melted dataframe in a single command
sns.histplot(df.melt(), x='value', hue='variable',
multiple='dodge', shrink=.75, bins=20);
Setting multiple='dodge' makes it so the bars are side-by-side, and shrink=.75 makes it so the pair of bars take up 3/4 of the whole bin.
To help understand what melt() did, these are the dataframes df and df.melt():
From the pandas website (http://pandas.pydata.org/pandas-docs/stable/visualization.html#visualization-hist):
df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000),
'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c'])
plt.figure();
df4.plot(kind='hist', alpha=0.5)
You make two dataframes and one matplotlib axis
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df1 = pd.DataFrame({
'data1': np.random.randn(10),
'data2': np.random.randn(10)
})
df2 = df1.copy()
fig, ax = plt.subplots()
df1.hist(column=['data1'], ax=ax)
df2.hist(column=['data2'], ax=ax)
Here is the snippet, In my case I have explicitly specified bins and range as I didn't handle outlier removal as the author of the book.
fig, ax = plt.subplots()
ax.hist([first.prglngth, others.prglngth], 10, (27, 50), histtype="bar", label=("First", "Other"))
ax.set_title("Histogram")
ax.legend()
Refer Matplotlib multihist plot with different sizes example.
this could be done with brevity
plt.hist([First, Other], bins = 40, color =('teal','blue'), label=("First", "Other"))
plt.legend(loc='best')
Note that as the number of bins increase, it may become a visual burden.
You could also try to check out the pandas.DataFrame.plot.hist() function which will plot the histogram of each column of the dataframe in the same figure.
Visibility is limited though but you can check out if it helps!
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.hist.html

Color by Column Values in Matplotlib

One of my favorite aspects of using the ggplot2 library in R is the ability to easily specify aesthetics. I can quickly make a scatterplot and apply color associated with a specific column and I would love to be able to do this with python/pandas/matplotlib. I'm wondering if there are there any convenience functions that people use to map colors to values using pandas dataframes and Matplotlib?
##ggplot scatterplot example with R dataframe, `df`, colored by col3
ggplot(data = df, aes(x=col1, y=col2, color=col3)) + geom_point()
##ideal situation with pandas dataframe, 'df', where colors are chosen by col3
df.plot(x=col1,y=col2,color=col3)
EDIT:
Thank you for your responses but I want to include a sample dataframe to clarify what I am asking. Two columns contain numerical data and the third is a categorical variable. The script I am thinking of will assign colors based on this value.
np.random.seed(250)
df = pd.DataFrame({'Height': np.append(np.random.normal(6, 0.25, size=5), np.random.normal(5.4, 0.25, size=5)),
'Weight': np.append(np.random.normal(180, 20, size=5), np.random.normal(140, 20, size=5)),
'Gender': ["Male","Male","Male","Male","Male",
"Female","Female","Female","Female","Female"]})
Height Weight Gender
0 5.824970 159.210508 Male
1 5.780403 180.294943 Male
2 6.318295 199.142201 Male
3 5.617211 157.813278 Male
4 6.340892 191.849944 Male
5 5.625131 139.588467 Female
6 4.950479 146.711220 Female
7 5.617245 121.571890 Female
8 5.556821 141.536028 Female
9 5.714171 134.396203 Female
Imports and Data
import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
seaborn.set(style='ticks')
numpy.random.seed(0)
N = 37
_genders= ['Female', 'Male', 'Non-binary', 'No Response']
df = pandas.DataFrame({
'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
'Gender': numpy.random.choice(_genders, size=N)
})
Update August 2021
With seaborn 0.11.0, it's recommended to use new figure level functions like seaborn.relplot than to use FacetGrid directly.
sns.relplot(data=df, x='Weight (kg)', y='Height (cm)', hue='Gender', hue_order=_genders, aspect=1.61)
plt.show()
Update October 2015
Seaborn handles this use-case splendidly:
Map matplotlib.pyplot.scatter onto a seaborn.FacetGrid
fg = sns.FacetGrid(data=df, hue='Gender', hue_order=_genders, aspect=1.61)
fg.map(plt.scatter, 'Weight (kg)', 'Height (cm)').add_legend()
Which immediately outputs:
Old Answer
In this case, I would use matplotlib directly.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def dfScatter(df, xcol='Height', ycol='Weight', catcol='Gender'):
fig, ax = plt.subplots()
categories = np.unique(df[catcol])
colors = np.linspace(0, 1, len(categories))
colordict = dict(zip(categories, colors))
df["Color"] = df[catcol].apply(lambda x: colordict[x])
ax.scatter(df[xcol], df[ycol], c=df.Color)
return fig
if 1:
df = pd.DataFrame({'Height':np.random.normal(size=10),
'Weight':np.random.normal(size=10),
'Gender': ["Male","Male","Unknown","Male","Male",
"Female","Did not respond","Unknown","Female","Female"]})
fig = dfScatter(df)
fig.savefig('fig1.png')
And that gives me:
As far as I know, that color column can be any matplotlib compatible color (RBGA tuples, HTML names, hex values, etc).
I'm having trouble getting anything but numerical values to work with the colormaps.
Actually you could use ggplot for python:
from ggplot import *
import numpy as np
import pandas as pd
df = pd.DataFrame({'Height':np.random.randn(10),
'Weight':np.random.randn(10),
'Gender': ["Male","Male","Male","Male","Male",
"Female","Female","Female","Female","Female"]})
ggplot(aes(x='Height', y='Weight', color='Gender'), data=df) + geom_point()
https://seaborn.pydata.org/generated/seaborn.scatterplot.html
import numpy
import pandas
import seaborn as sns
numpy.random.seed(0)
N = 37
_genders= ['Female', 'Male', 'Non-binary', 'No Response']
df = pandas.DataFrame({
'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
'Gender': numpy.random.choice(_genders, size=N)
})
sns.scatterplot(data=df, x='Height (cm)', y='Weight (kg)', hue='Gender')
You can use the color parameter to the plot method to define the colors you want for each column. For example:
from pandas import DataFrame
data = DataFrame({'a':range(5),'b':range(1,6),'c':range(2,7)})
colors = ['yellowgreen','cyan','magenta']
data.plot(color=colors)
You can use color names or Color hex codes like '#000000' for black say. You can find all the defined color names in matplotlib's color.py file. Below is the link for the color.py file in matplotlib's github repo.
https://github.com/matplotlib/matplotlib/blob/master/lib/matplotlib/colors.py
The OP is coloring by a categorical column, but this answer is for coloring by a column that is numeric, or can be interpreted as numeric, such as a datetime dtype.
pandas.DataFrame.plot and matplotlib.pyplot.scatter can take a c or color parameter, which must be a color, a sequence of colors, or a sequence of numbers.
Tested in python 3.8, pandas 1.3.1, and matplotlib 3.4.2
Choosing Colormaps in Matplotlib for other valid cmap options.
Imports and Test Data
'Date' is already a datetime64[ns] dtype from DataReader
conda install -c anaconda pandas-datareader or pip install pandas-datareader depending on your environment.
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader as web # for data; not part of pandas
tickers = 'amzn'
df = web.DataReader(ticker, data_source='yahoo', start='2018-01-01', end='2021-01-01').reset_index()
df['ticker'] = ticker
Date High Low Open Close Volume Adj Close ticker
0 2018-01-02 1190.00000 1170.510010 1172.000000 1189.010010 2694500 1189.010010 amzn
1 2018-01-03 1205.48999 1188.300049 1188.300049 1204.199951 3108800 1204.199951 amzn
c as a number
pandas.DataFrame.plot
df.Date.dt.month creates a pandas.Series of month numbers
ax = df.plot(kind='scatter', x='Date', y='High', c=df.Date.dt.month, cmap='Set3', figsize=(11, 4), title='c parameter as a month number')
plt.show()
matplotlib.pyplot.scatter
fig, ax = plt.subplots(figsize=(11, 4))
ax.scatter(data=df, x='Date', y='High', c=df.Date.dt.month, cmap='Set3')
ax.set(title='c parameter as a month number', xlabel='Date', ylabel='High')
plt.show()
c as a datetime dtype
pandas.DataFrame.plot
ax = df.plot(kind='scatter', x='Date', y='High', c='Date', cmap='winter', figsize=(11, 4), title='c parameter as a datetime dtype')
plt.show()
matplotlib.pyplot.scatter
fig, ax = plt.subplots(figsize=(11, 4))
ax.scatter(data=df, x='Date', y='High', c='Date', cmap='winter')
ax.set(title='c parameter as a datetime dtype', xlabel='Date', ylabel='High')
plt.show()
Though not matplotlib, you can achieve this using plotly express:
import numpy as np
import pandas as pd
import plotly.express as px
df = pd.DataFrame({
'Height':np.random.normal(size=10),
'Weight':np.random.normal(size=10),
'Size': 1, # How large each point should be?
'Gender': ["Male","Male","Male","Male","Male","Female","Female","Female","Female","Female"]})
# Create your plot
px.scatter(df, x='Weight', y='Height', size='Size', color='Gender')
If creating in a notebook, you'll get an interactive output like the following:

Categories

Resources