The columns are:
Company
Location
DateTime
Details
Outcome: Prelaunch Failure / Partial Failure / Failure / Success
Write a program to do the following:
Create a pie plot that shows the distribution of mission outcomes.
Create a horizontal bar plot showing the total number of missions for each company, with the most at the top and the fewest at the bottom.
Create the same horizontal bar plot, but with the missions grouped into success and failure, with success in green and failure in red.
For 3. I should be getting a plot as below:
But I am getting as following:
I don't understand what I am doing wrong, how can I fix it? Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
#Read the data file into dataframe
df = pd.read_csv("data.csv")
#Length of dataframe
total = len(df)
#Success percentage
success = (len(df[df['Outcome'] == 'Success'])/total) * 100
#Failure percentage
failure = (len(df[df['Outcome'] == 'Failure'])/total) * 100
#Partial failure percentage
partial_failure = (len(df[df['Outcome'] == 'Partial Failure'])/total) * 100
#Prelaunch failure percentage
prelaunch_failure = (len(df[df['Outcome'] == 'Prelaunch Failure'])/total) * 100
dict_1 = {}
# Iterate through company column
for k in df['Company']:
if k in dict_1:
dict_1[k] += 1
else:
dict_1[k] = 1
sorted_dict = {}
#Sort dictionary
sorted_keys = sorted(dict_1, key=dict_1.get, reverse=True)
for w in sorted_keys:
# Add sorted values in the new dictionary
sorted_dict[w] = dict_1[w]
dict_3 = {}
dict_4 = {}
#Iterate through unique Companies
for c in df['Company'].unique():
failure = 0
#Get only the outcome column of each company
comp_proj = df.loc[df['Company'] == c, 'Outcome']
for h in comp_proj:
if h == 'Failure' or h == 'Partial Failure' or h == 'Prelaunch Failure':
failure = failure + 1
dict_3[c] = failure
dict_2 = {}
for l in dict_2.keys():
dict_4[l] = dict_3[l]
# Plot Pie Chart
figure1, ax1 = plt.subplots(figsize=(10,15))
status = [success,failure,partial_failure,prelaunch_failure]
label = ["success","failure","partial","prelaunch"]
ax1.pie(status,labels=label)
ax1.set_ylabel("Outcome")
plt.savefig('plot 1.png')
# Plot the first bar chart
figure2, ax2 = plt.subplots(figsize=(15,15))
ax2.barh(range(len(sorted_dict)), sorted_dict.values(), align='center')
ax2.set_yticks(range(len(sorted_dict)))
ax2.set_yticklabels(sorted_dict.keys())
ax2.set_ylabel("Company")
ax2.invert_yaxis()
plt.savefig('plot 2.png')
# Plot the second bar chart
figure3, ax3 = plt.subplots(figsize=(15,15))
ax3.barh(range(len(sorted_dict)), sorted_dict.values(), align='center', color='green')
ax3.barh(range(len(dict_4)), dict_4.values(), align='center', color='red')
ax3.set_yticks(range(len(sorted_dict)))
ax3.set_yticklabels(sorted_dict.keys())
ax3.set_ylabel("Company")
ax3.legend(["Success", "Failure"])
ax3.invert_yaxis()
plt.savefig('plot 3.png')
Sample of data:
Related
This is a code for a waterfall chart. I'd kindly like to ask:
if there is a way to simplify this code. The code is far too long and I'm sure there is a lot of extra lines of code that could be reduced.
How I can make the first and last bars black?. Since I am creating a waterfall chart I am looking for the first and last value to be black at all times and the values in between to be green or red depending on whether or not it is a negative or positive number.
Bars greater than zero green.
Bars less than zero red.
Any help would be greatly appreciated.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
#Use python 2.7+ syntax to format currency
def money(x, pos):
'The two args are the value and tick position'
return "${:,.0f}".format(x)
formatter = FuncFormatter(money)
#Data to plot. Do not include a total, it will be calculated
index = ['sales','returns','credit fees','rebates','late charges','shipping']
data = {'amount': [350000,-30000,-7500,-25000,95000,-7000]}
#Store data and create a blank series to use for the waterfall
trans = pd.DataFrame(data=data,index=index)
blank = trans.amount.cumsum().shift(1).fillna(0)
#Get the net total number for the final element in the waterfall
total = trans.sum().amount
trans.loc["net"]= total
blank.loc["net"] = total
#The steps graphically show the levels as well as used for label placement
step = blank.reset_index(drop=True).repeat(3).shift(-1)
step[1::3] = np.nan
#When plotting the last element, we want to show the full bar,
#Set the blank to 0
blank.loc["net"] = 0
#Plot and label
my_plot = trans.plot(kind='bar', stacked=True, bottom=blank,legend=None, figsize=(10, 5), title="2014 Sales Waterfall")
my_plot.plot(step.index, step.values,'k')
my_plot.set_xlabel("Transaction Types")
#Format the axis for dollars
my_plot.yaxis.set_major_formatter(formatter)
#Get the y-axis position for the labels
y_height = trans.amount.cumsum().shift(1).fillna(0)
#Get an offset so labels don't sit right on top of the bar
max = trans.max()
neg_offset = max / 25
pos_offset = max / 50
plot_offset = int(max / 15)
#Start label loop
loop = 0
for index, row in trans.iterrows():
# For the last item in the list, we don't want to double count
if row['amount'] == total:
y = y_height[loop]
else:
y = y_height[loop] + row['amount']
# Determine if we want a neg or pos offset
if row['amount'] > 0:
y += pos_offset
else:
y -= neg_offset
my_plot.annotate("{:,.0f}".format(row['amount']),(loop,y),ha="center")
loop+=1
#Scale up the y axis so there is room for the labels
my_plot.set_ylim(0,blank.max()+int(plot_offset))
#Rotate the labels
my_plot.set_xticklabels(trans.index,rotation=0)
my_plot.get_figure().savefig("waterfall.png",dpi=200,bbox_inches='tight')
Answer to questions 2, 3 and 4: set the colors of the bar patches after plotting them:
for p, c in zip(my_plot.containers[0].patches, np.r_[0, np.sign(trans.amount[1:-1]), 0]):
p.set_color({0: 'k', 1: 'g', -1: 'r'}[c])
I am having a problem with waterfall. I took this chart from matplotlib site and added my own data frame with 2 simple columns with some integer numbers. My waterfall was produced but without numbers, just empty bars. I am a bit lost and I would appreciate any suggestions.
What I am trying to build is the custom waterfall that takes one dataframe with column names, values, and some values for filters like countries. I haven't found anything like that anywhere so I am trying to build my own.
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
from matplotlib.ticker import FuncFormatter;
dataset = pd.read_csv('waterfall_test_data.csv')
#Use python 2.7+ syntax to format currency
def money(x, pos):
'The two args are the value and tick position'
return "${:,.0f}".format(x)
formatter = FuncFormatter(money)
#Data to plot. Do not include a total, it will be calculated
index = dataset['columns']
data = dataset['amount']
#Store data and create a blank series to use for the waterfall
trans = pd.DataFrame(data=data,index=index)
blank = trans.amount.cumsum().shift(1).fillna(0)
#Get the net total number for the final element in the waterfall
total = trans.sum().amount
trans.loc["net"]= total
blank.loc["net"] = total
#The steps graphically show the levels as well as used for label placement
step = blank.reset_index(drop=True).repeat(3).shift(-1)
step[1::3] = np.nan
#When plotting the last element, we want to show the full bar,
#Set the blank to 0
blank.loc["net"] = 0
#Plot and label
my_plot = trans.plot(kind='bar', stacked=True, bottom=blank,legend=None, figsize=(15, 5), title="2014 Sales Waterfall")
my_plot.plot(step.index, step.values,'k')
my_plot.set_xlabel("Transaction Types")
#Format the axis for dollars
my_plot.yaxis.set_major_formatter(formatter)
#Get the y-axis position for the labels
y_height = trans.amount.cumsum().shift(1).fillna(0)
#Get an offset so labels don't sit right on top of the bar
max = trans.max()
neg_offset = max / 25
pos_offset = max / 50
plot_offset = int(max / 15)
#Start label loop
loop = 0
for index, row in trans.iterrows():
# For the last item in the list, we don't want to double count
if row['amount'] == total:
y = y_height[loop]
else:
y = y_height[loop] + row['amount']
# Determine if we want a neg or pos offset
if row['amount'] > 0:
y += pos_offset
else:
y -= neg_offset
my_plot.annotate("{:,.0f}".format(row['amount']),(loop,y),ha="center")
loop+=1
#Scale up the y axis so there is room for the labels
my_plot.set_ylim(0,blank.max()+int(plot_offset))
#Rotate the labels
my_plot.set_xticklabels(trans.index,rotation=0)
my_plot.get_figure().savefig("waterfall.png",dpi=200,bbox_inches='tight')
i'm trying to plot a simple bar chart of the average 'resale_price' (y-axis) for each flat type against 'town' (x-axis) for data from 2015-2019. However for some reason, my x-axis keeps changing every time I re-run my code. Not sure where I've gone wrong.
dataset: https://data.gov.sg/dataset/resale-flat-prices
here's the code i've used below
labels1 = list(set(data_3room['town']))
town1 = np.arange(0,len(labels1))
town1_values = data_3room[['town','resale_price']]
values1 = town1_values['resale_price']/1000
# print(values1)
avg_values1 = {}
for i in labels1:
valuesfortown1 = values1[town1_values['town']==i]
avg1 = np.average(valuesfortown1)
print("Average 3 Room Resale Price for town " + i + " is {:.0f}".format(avg1))
avg_values1[i] = avg1
from collections import OrderedDict
from operator import itemgetter
avg_values1 = OrderedDict(sorted(avg_values1.items(), key = itemgetter(1), reverse = True))
plt.figure(1, figsize=(30,30))
barchart1 = plt.bar(list(avg_values1.keys()), list(avg_values1.values()), color='#d62728')
for i in range(len(barchart1)):
bar1 = barchart1[i]
x1,y1 = bar1.get_xy()
h1 = bar1.get_height()
plt.text(x1,h1,"{:.0f}".format(list(avg_values1.values())[i]),fontsize=30)
plt.title('3 Room Resale Prices by Town',fontsize=40)
plt.ylabel('Resale Prices (Thousands)',fontsize=40)
plt.yticks(fontsize=20)
plt.xticks(town1, labels1, fontsize=40,rotation='vertical')
I am using k-mans clustering as a means of customer and product segmentation. I found a function on stack that takes the cluster results and reorders them based on the average value of a target value in the dataframe. This seems to be working quite well but in order to plot the results I am first creating a string column in the data frame based on the ordered clustering to prevent seaborn from creating bins in the hue labels. The first problem I ran into was that while the plot and labels were being generated as intended the legend was out of order. I added a hue order but the ledgend becomes fixed to this order so changing the value of K makes the legend confusing. I added a function to address this problem as well and everything seems to be working as intended but I would like to know if there are any better ways of achiving this. I will place the related code blocks bellow.
#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
new_cluster_field_name = 'new_' + cluster_field_name
df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
df_new['index'] = df_new.index
df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
df_final = df_final.drop([cluster_field_name],axis=1)
df_final = df_final.rename(columns={"index":cluster_field_name})
return df_final
#adding column to dataframe based on clustering
kmeans = KMeans(n_clusters=4)
kmeans.fit(data[['ORDERS_PLACED','UNITS_SOLD','AVG_UNIT_PRICE','TOTAL_SALES']])
data['Rank'] = kmeans.predict(data[['ORDERS_PLACED','UNITS_SOLD','AVG_UNIT_PRICE','TOTAL_SALES']])
#ordering the results
data = order_cluster('Rank','TOTAL_SALES',data,True)
top = data['Rank'].max()
#adding string column to dataframe
data['Rank_ID'] = [('Group_A' if x == top else
('Group_B' if x == top - 1 else
('Group_C' if x == top - 2 else
('Group_D' if x == top - 3 else
('Group_E' if x == top - 4 else
('Group_F' if x == top - 5 else
('Group_G' if x == top - 6 else
('Group_H' if x == top - 7 else
('Group_I' if x == top - 8 else
('Group_J' if x == top - 9 else 'Group_Z')))))))))
) for x in data['Rank']]
#function to build the plot legend values
def build_legend(k_value):
if k_value == 0:
legend = ['Group_A']
elif k_value == 1:
legend = ['Group_A','Group_B']
elif k_value == 2:
legend = ['Group_A','Group_B','Group_C']
elif k_value == 3:
legend = ['Group_A','Group_B','Group_C','Group_D']
elif k_value == 4:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E']
elif k_value == 5:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F']
elif k_value == 6:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G']
elif k_value == 7:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H']
elif k_value == 8:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H','Group_I']
elif k_value == 9:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H','Group_I','Group_J']
else:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H','Group_I','Group_J','Group_Z']
return legend
#plotting the results
orderHue = build_legend(top)
fig, ax = plt.subplots(figsize=(12,5))
plot = sns.scatterplot(x='ORDERS_PLACED', y='TOTAL_SALES', hue='Rank_ID', size='Rank_ID',
hue_order=orderHue, size_order=orderHue, data=report, ax=ax)
ytick = plot.get_yticks()
plot.set_yticklabels(['{:,.0f}'.format(x) for x in ytick])
plot.set_title('80/20 Customer Segmentation Using K-Means Clustering, Plot on Orders Placed & Total Sales',fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.show(plot)
This seems like a lot of code to achive what might be quite simple.
Here is a quick sample of the data as requested,
CUSTOMER_ID ORDERS_PLACED UNITS_SOLD AVG_UNIT_PRICE TOTAL_SALES
A 2 59 21553.9 1271680
B 106 184 6295.9 1158445.7
C 13 78 14290 1114620
D 43 2034 245.38 499102
E 53 582 760.92 442856
F 1 6 15000 90000
G 3 60 967 58020
H 1 1 1807 1807
I am using python with matplotlib and need to visualize distribution percentage of sub-groups of an data set.
imagine this tree:
Data --- group1 (40%)
-
--- group2 (25%)
-
--- group3 (35%)
group1 --- A (25%)
-
--- B (25%)
-
--- c (50%)
and it can go on, each group can have several sub-groups and same for each sub group.
How can i plot a proper chart for this info?
I created a minimal reproducible example that I think fits your description, but please let me know if that is not what you need.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
data = pd.DataFrame()
n_rows = 100
data['group'] = np.random.choice(['1', '2', '3'], n_rows)
data['subgroup'] = np.random.choice(['A', 'B', 'C'], n_rows)
For instance, we could get the following counts for the subgroups.
In [1]: data.groupby(['group'])['subgroup'].value_counts()
Out[1]: group subgroup
1 A 17
C 16
B 5
2 A 23
C 10
B 7
3 C 8
A 7
B 7
Name: subgroup, dtype: int64
I created a function that computes the necessary counts given an ordering of the columns (e.g. ['group', 'subgroup']) and incrementally plots the bars with the corresponding percentages.
import matplotlib.pyplot as plt
import matplotlib.cm
def plot_tree(data, ordering, axis=False):
"""
Plots a sequence of bar plots reflecting how the data
is distributed at different levels. The order of the
levels is given by the ordering parameter.
Parameters
----------
data: pandas DataFrame
ordering: list
Names of the columns to be plotted.They should be
ordered top down, from the larger to the smaller group.
axis: boolean
Whether to plot the axis.
Returns
-------
fig: matplotlib figure object.
The final tree plot.
"""
# Frame set-up
fig, ax = plt.subplots(figsize=(9.2, 3*len(ordering)))
ax.set_xticks(np.arange(-1, len(ordering)) + 0.5)
ax.set_xticklabels(['All'] + ordering, fontsize=18)
if not axis:
plt.axis('off')
counts=[data.shape[0]]
# Get colormap
labels = ['All']
for o in reversed(ordering):
labels.extend(data[o].unique().tolist())
# Pastel is nice but has few colors. Change for a larger map if needed
cmap = matplotlib.cm.get_cmap('Pastel1', len(labels))
colors = dict(zip(labels, [cmap(i) for i in range(len(labels))]))
# Group the counts
counts = data.groupby(ordering).size().reset_index(name='c_' + ordering[-1])
for i, o in enumerate(ordering[:-1], 1):
if ordering[:i]:
counts['c_' + o]=counts.groupby(ordering[:i]).transform('sum')['c_' + ordering[-1]]
# Calculate percentages
counts['p_' + ordering[0]] = counts['c_' + ordering[0]]/data.shape[0]
for i, o in enumerate(ordering[1:], 1):
counts['p_' + o] = counts['c_' + o]/counts['c_' + ordering[i-1]]
# Plot first bar - all data
ax.bar(-1, data.shape[0], width=1, label='All', color=colors['All'], align="edge")
ax.annotate('All -- 100%', (-0.9, 0.5), fontsize=12)
comb = 1 # keeps track of the number of possible combinations at each level
for bar, col in enumerate(ordering):
labels = sorted(data[col].unique())*comb
comb *= len(data[col].unique())
# Get only the relevant counts at this level
local_counts = counts[ordering[:bar+1] +
['c_' + o for o in ordering[:bar+1]] +
['p_' + o for o in ordering[:bar+1]]].drop_duplicates()
sizes = local_counts['c_' + col]
percs = local_counts['p_' + col]
bottom = 0 # start at from 0
for size, perc, label in zip(sizes, percs, labels):
ax.bar(bar, size, width=1, bottom=bottom, label=label, color=colors[label], align="edge")
ax.annotate('{} -- {:.0%}'.format(label, perc), (bar+0.1, bottom+0.5), fontsize=12)
bottom += size # stack the bars
ax.legend(colors)
return fig
With the data shown above we would get the following.
fig = plot_tree(data, ['group', 'subgroup'], axis=True)
Have you tried stacked bar graph?
https://matplotlib.org/gallery/lines_bars_and_markers/bar_stacked.html#sphx-glr-gallery-lines-bars-and-markers-bar-stacked-py