How to plot k-means clustering results in an ordered way - python

I am using k-mans clustering as a means of customer and product segmentation. I found a function on stack that takes the cluster results and reorders them based on the average value of a target value in the dataframe. This seems to be working quite well but in order to plot the results I am first creating a string column in the data frame based on the ordered clustering to prevent seaborn from creating bins in the hue labels. The first problem I ran into was that while the plot and labels were being generated as intended the legend was out of order. I added a hue order but the ledgend becomes fixed to this order so changing the value of K makes the legend confusing. I added a function to address this problem as well and everything seems to be working as intended but I would like to know if there are any better ways of achiving this. I will place the related code blocks bellow.
#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
new_cluster_field_name = 'new_' + cluster_field_name
df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
df_new['index'] = df_new.index
df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
df_final = df_final.drop([cluster_field_name],axis=1)
df_final = df_final.rename(columns={"index":cluster_field_name})
return df_final
#adding column to dataframe based on clustering
kmeans = KMeans(n_clusters=4)
kmeans.fit(data[['ORDERS_PLACED','UNITS_SOLD','AVG_UNIT_PRICE','TOTAL_SALES']])
data['Rank'] = kmeans.predict(data[['ORDERS_PLACED','UNITS_SOLD','AVG_UNIT_PRICE','TOTAL_SALES']])
#ordering the results
data = order_cluster('Rank','TOTAL_SALES',data,True)
top = data['Rank'].max()
#adding string column to dataframe
data['Rank_ID'] = [('Group_A' if x == top else
('Group_B' if x == top - 1 else
('Group_C' if x == top - 2 else
('Group_D' if x == top - 3 else
('Group_E' if x == top - 4 else
('Group_F' if x == top - 5 else
('Group_G' if x == top - 6 else
('Group_H' if x == top - 7 else
('Group_I' if x == top - 8 else
('Group_J' if x == top - 9 else 'Group_Z')))))))))
) for x in data['Rank']]
#function to build the plot legend values
def build_legend(k_value):
if k_value == 0:
legend = ['Group_A']
elif k_value == 1:
legend = ['Group_A','Group_B']
elif k_value == 2:
legend = ['Group_A','Group_B','Group_C']
elif k_value == 3:
legend = ['Group_A','Group_B','Group_C','Group_D']
elif k_value == 4:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E']
elif k_value == 5:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F']
elif k_value == 6:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G']
elif k_value == 7:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H']
elif k_value == 8:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H','Group_I']
elif k_value == 9:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H','Group_I','Group_J']
else:
legend = ['Group_A','Group_B','Group_C','Group_D','Group_E','Group_F','Group_G','Group_H','Group_I','Group_J','Group_Z']
return legend
#plotting the results
orderHue = build_legend(top)
fig, ax = plt.subplots(figsize=(12,5))
plot = sns.scatterplot(x='ORDERS_PLACED', y='TOTAL_SALES', hue='Rank_ID', size='Rank_ID',
hue_order=orderHue, size_order=orderHue, data=report, ax=ax)
ytick = plot.get_yticks()
plot.set_yticklabels(['{:,.0f}'.format(x) for x in ytick])
plot.set_title('80/20 Customer Segmentation Using K-Means Clustering, Plot on Orders Placed & Total Sales',fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.show(plot)
This seems like a lot of code to achive what might be quite simple.
Here is a quick sample of the data as requested,
CUSTOMER_ID ORDERS_PLACED UNITS_SOLD AVG_UNIT_PRICE TOTAL_SALES
A 2 59 21553.9 1271680
B 106 184 6295.9 1158445.7
C 13 78 14290 1114620
D 43 2034 245.38 499102
E 53 582 760.92 442856
F 1 6 15000 90000
G 3 60 967 58020
H 1 1 1807 1807

Related

Reorder Sankey diagram vertically based on label value

I'm trying to plot patient flows between 3 clusters in a Sankey diagram. I have a pd.DataFrame counts with from-to values, see below. To reproduce this DF, here is the counts dict that should be loaded into a pd.DataFrame (which is the input for the visualize_cluster_flow_counts function).
from to value
0 C1_1 C1_2 867
1 C1_1 C2_2 405
2 C1_1 C0_2 2
3 C2_1 C1_2 46
4 C2_1 C2_2 458
... ... ... ...
175 C0_20 C0_21 130
176 C0_20 C2_21 1
177 C2_20 C1_21 12
178 C2_20 C0_21 0
179 C2_20 C2_21 96
The from and to values in the DataFrame represent the cluster number (either 0, 1, or 2) and the amount of days for the x-axis (between 1 and 21). If I plot the Sankey diagram with these values, this is the result:
Code:
import plotly.graph_objects as go
def visualize_cluster_flow_counts(counts):
all_sources = list(set(counts['from'].values.tolist() + counts['to'].values.tolist()))
froms, tos, vals, labs = [], [], [], []
for index, row in counts.iterrows():
froms.append(all_sources.index(row.values[0]))
tos.append(all_sources.index(row.values[1]))
vals.append(row[2])
labs.append(row[3])
fig = go.Figure(data=[go.Sankey(
arrangement='snap',
node = dict(
pad = 15,
thickness = 5,
line = dict(color = "black", width = 0.1),
label = all_sources,
color = "blue"
),
link = dict(
source = froms,
target = tos,
value = vals,
label = labs
))])
fig.update_layout(title_text="Patient flow between clusters over time: 48h (2 days) - 504h (21 days)", font_size=10)
fig.show()
visualize_cluster_flow_counts(counts)
However, I would like to vertically order the bars so that the C0's are always on top, the C1's are always in the middle, and the C2's are always at the bottom (or the other way around, doesn't matter). I know that we can set node.x and node.y to manually assign the coordinates. So, I set the x-values to the amount of days * (1/range of days), which is an increment of +- 0.045. And I set the y-values based on the cluster value: either 0, 0.5 or 1. I then obtain the image below. The vertical order is good, but the vertical margins between the bars are obviously way off; they should be similar to the first result.
The code to produce this is:
import plotly.graph_objects as go
def find_node_coordinates(sources):
x_nodes, y_nodes = [], []
for s in sources:
# Shift each x with +- 0.045
x = float(s.split("_")[-1]) * (1/21)
x_nodes.append(x)
# Choose either 0, 0.5 or 1 for the y-value
cluster_number = s[1]
if cluster_number == "0": y = 1
elif cluster_number == "1": y = 0.5
else: y = 1e-09
y_nodes.append(y)
return x_nodes, y_nodes
def visualize_cluster_flow_counts(counts):
all_sources = list(set(counts['from'].values.tolist() + counts['to'].values.tolist()))
node_x, node_y = find_node_coordinates(all_sources)
froms, tos, vals, labs = [], [], [], []
for index, row in counts.iterrows():
froms.append(all_sources.index(row.values[0]))
tos.append(all_sources.index(row.values[1]))
vals.append(row[2])
labs.append(row[3])
fig = go.Figure(data=[go.Sankey(
arrangement='snap',
node = dict(
pad = 15,
thickness = 5,
line = dict(color = "black", width = 0.1),
label = all_sources,
color = "blue",
x = node_x,
y = node_y,
),
link = dict(
source = froms,
target = tos,
value = vals,
label = labs
))])
fig.update_layout(title_text="Patient flow between clusters over time: 48h (2 days) - 504h (21 days)", font_size=10)
fig.show()
visualize_cluster_flow_counts(counts)
Question: how do I fix the margins of the bars, so that the result looks like the first result? So, for clarity: the bars should be pushed to the bottom. Or is there another way that the Sankey diagram can vertically re-order the bars automatically based on the label value?
Firstly I don't think there is a way with the current exposed API to achieve your goal smoothly you can check the source code here.
Try to change your find_node_coordinates function as follows (note that you should pass the counts DataFrame to):
counts = pd.DataFrame(counts_dict)
def find_node_coordinates(sources, counts):
x_nodes, y_nodes = [], []
flat_on_top = False
range = 1 # The y range
total_margin_width = 0.15
y_range = 1 - total_margin_width
margin = total_margin_width / 2 # From number of Cs
srcs = counts['from'].values.tolist()
dsts = counts['to'].values.tolist()
values = counts['value'].values.tolist()
max_acc = 0
def _calc_day_flux(d=1):
_max_acc = 0
for i in [0,1,2]:
# The first ones
from_source = 'C{}_{}'.format(i,d)
indices = [i for i, val in enumerate(srcs) if val == from_source]
for j in indices:
_max_acc += values[j]
return _max_acc
def _calc_node_io_flux(node_str):
c,d = int(node_str.split('_')[0][-1]), int(node_str.split('_')[1])
_flux_src = 0
_flux_dst = 0
indices_src = [i for i, val in enumerate(srcs) if val == node_str]
indices_dst = [j for j, val in enumerate(dsts) if val == node_str]
for j in indices_src:
_flux_src += values[j]
for j in indices_dst:
_flux_dst += values[j]
return max(_flux_dst, _flux_src)
max_acc = _calc_day_flux()
graph_unit_per_val = y_range / max_acc
print("Graph Unit per Acc Val", graph_unit_per_val)
for s in sources:
# Shift each x with +- 0.045
d = int(s.split("_")[-1])
x = float(d) * (1/21)
x_nodes.append(x)
print(s, _calc_node_io_flux(s))
# Choose either 0, 0.5 or 1 for the y-v alue
cluster_number = s[1]
# Flat on Top
if flat_on_top:
if cluster_number == "0":
y = _calc_node_io_flux('C{}_{}'.format(2, d))*graph_unit_per_val + margin + _calc_node_io_flux('C{}_{}'.format(1, d))*graph_unit_per_val + margin + _calc_node_io_flux('C{}_{}'.format(0, d))*graph_unit_per_val/2
elif cluster_number == "1": y = _calc_node_io_flux('C{}_{}'.format(2, d))*graph_unit_per_val + margin + _calc_node_io_flux('C{}_{}'.format(1, d))*graph_unit_per_val/2
else: y = 1e-09
# Flat On Bottom
else:
if cluster_number == "0": y = 1 - (_calc_node_io_flux('C{}_{}'.format(0,d))*graph_unit_per_val / 2)
elif cluster_number == "1": y = 1 - (_calc_node_io_flux('C{}_{}'.format(0,d))*graph_unit_per_val + margin + _calc_node_io_flux('C{}_{}'.format(1,d)) * graph_unit_per_val /2 )
elif cluster_number == "2": y = 1 - (_calc_node_io_flux('C{}_{}'.format(0,d))*graph_unit_per_val + margin + _calc_node_io_flux('C{}_{}'.format(1,d)) * graph_unit_per_val + margin + _calc_node_io_flux('C{}_{}'.format(2,d)) * graph_unit_per_val /2 )
y_nodes.append(y)
return x_nodes, y_nodes
Sankey graphs supposed to weigh their connection width by their corresponding normalized values right? Here I do the same, first, it calculates each node flux, later by calculating the normalized coordinate the center of each node calculated according to their flux.
Here is the sample output of your code with the modified function, note that I tried to adhere to your code as much as possible so it's a bit unoptimized(for example, one could store the values of nodes above each specified source node to avoid its flux recalculation).
With flag flat_on_top = True
With flag flat_on_top = False
There is a bit of inconsistency in the flat_on_bottom version which I think is caused by the padding or other internal sources of Plotly API.

How can I debug this plotting issue?

The columns are:
Company
Location
DateTime
Details
Outcome: Prelaunch Failure / Partial Failure / Failure / Success
Write a program to do the following:
Create a pie plot that shows the distribution of mission outcomes.
Create a horizontal bar plot showing the total number of missions for each company, with the most at the top and the fewest at the bottom.
Create the same horizontal bar plot, but with the missions grouped into success and failure, with success in green and failure in red.
For 3. I should be getting a plot as below:
But I am getting as following:
I don't understand what I am doing wrong, how can I fix it? Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
#Read the data file into dataframe
df = pd.read_csv("data.csv")
#Length of dataframe
total = len(df)
#Success percentage
success = (len(df[df['Outcome'] == 'Success'])/total) * 100
#Failure percentage
failure = (len(df[df['Outcome'] == 'Failure'])/total) * 100
#Partial failure percentage
partial_failure = (len(df[df['Outcome'] == 'Partial Failure'])/total) * 100
#Prelaunch failure percentage
prelaunch_failure = (len(df[df['Outcome'] == 'Prelaunch Failure'])/total) * 100
dict_1 = {}
# Iterate through company column
for k in df['Company']:
if k in dict_1:
dict_1[k] += 1
else:
dict_1[k] = 1
sorted_dict = {}
#Sort dictionary
sorted_keys = sorted(dict_1, key=dict_1.get, reverse=True)
for w in sorted_keys:
# Add sorted values in the new dictionary
sorted_dict[w] = dict_1[w]
dict_3 = {}
dict_4 = {}
#Iterate through unique Companies
for c in df['Company'].unique():
failure = 0
#Get only the outcome column of each company
comp_proj = df.loc[df['Company'] == c, 'Outcome']
for h in comp_proj:
if h == 'Failure' or h == 'Partial Failure' or h == 'Prelaunch Failure':
failure = failure + 1
dict_3[c] = failure
dict_2 = {}
for l in dict_2.keys():
dict_4[l] = dict_3[l]
# Plot Pie Chart
figure1, ax1 = plt.subplots(figsize=(10,15))
status = [success,failure,partial_failure,prelaunch_failure]
label = ["success","failure","partial","prelaunch"]
ax1.pie(status,labels=label)
ax1.set_ylabel("Outcome")
plt.savefig('plot 1.png')
# Plot the first bar chart
figure2, ax2 = plt.subplots(figsize=(15,15))
ax2.barh(range(len(sorted_dict)), sorted_dict.values(), align='center')
ax2.set_yticks(range(len(sorted_dict)))
ax2.set_yticklabels(sorted_dict.keys())
ax2.set_ylabel("Company")
ax2.invert_yaxis()
plt.savefig('plot 2.png')
# Plot the second bar chart
figure3, ax3 = plt.subplots(figsize=(15,15))
ax3.barh(range(len(sorted_dict)), sorted_dict.values(), align='center', color='green')
ax3.barh(range(len(dict_4)), dict_4.values(), align='center', color='red')
ax3.set_yticks(range(len(sorted_dict)))
ax3.set_yticklabels(sorted_dict.keys())
ax3.set_ylabel("Company")
ax3.legend(["Success", "Failure"])
ax3.invert_yaxis()
plt.savefig('plot 3.png')
Sample of data:

Having trouble getting subplots to show up correctly with larger data sets

I am having trouble getting subplots to show up correctly with larger data sets.
I am ok with having the figure grow for my application. I am also ok with having the figure grow such that all the graphs would be about the size of the ones showing up in the small data set example if that is possible. (anaconda3/v4.2.0/python)
plt.rcParams['figure.autolayout']=True
figa, axa = plt.subplots(rowcnt, colcnt)
figa.suptitle("Users Disk Space Usage Over Time.\n")
ax_index = 0
for r in range(rowcnt)
for c in range(colcnt):
n = r * c
user = gr.columns[n]
ur = gr[user]
x = ur.index
y = ur.values
while is_color_like(colorpairs[colorindex]) == False or is_color_like(colorpairs[colorindex+1]) == False :
colorindex = int((colorindex + 2) % (len(colorpairs)/2))
axa[r,c].plot(x, y, color=colorpairs[colorindex+1], alpha=0.6)
plt.setp(axa[r,c].get_xticklabels(), rotation=30)
if len(x) > 1:
axa[r,c].fill_between(x, y, color=colorpairs[colorindex],alpha=0.4)
axa[r,c].set_ylim(0,disksizebytes)
axa[r,c].set_title(user)
axa[r,c].set_xlabel('date')
axa[r,c].set_ylabel('space used')
axa[r,c].grid(True)
i += 1
colorindex = int((colorindex + 2) % (len(colorpairs)/2))
detailarryimage = "{}/detailarryimage.png".format(datafolder)
figa.savefig(detailarryimage)
Small Set Image
Large Set Image

Segmentation Algorithm

I had created a segmentation algorithm that is able to detect levels above a certain threshold and alternates between 2 categories (red and green) as shown in Figure 1 below. My current algorithm always take reference from the left and detects the first segment as red, followed by green and so on.
However, I would like the algorithm to take reference starting from the center of the longest 0s gap (10-01-06 to 10-01-14) outwards, where the first segment detected on the left will always be green and the first segment detected on the right will always be red. The longest gap can be anywhere and may not always be at the center of the dataset.
I would like the algorithm to return individual list for red and green, with their respective starting and ending point indexes.
I have attached the codes below to replicate the plot.
Figure1:
# Creating Dummy Dataset
df = pd.DataFrame(np.random.uniform(50,100,size=(100, 1)))
df2 = pd.DataFrame(np.random.uniform(50,100,size=(100, 1)))
zeros1 = pd.DataFrame(np.zeros(80))
zeros2 = pd.DataFrame(np.zeros(50))
zeros3 = pd.DataFrame(np.zeros(500))
zeros4 = pd.DataFrame(np.zeros(80))
zeros5 = pd.DataFrame(np.zeros(50))
df3 = pd.DataFrame(np.random.uniform(50,100,size=(100, 1)))
df4 = pd.DataFrame(np.random.uniform(50,100,size=(100, 1)))
df5=pd.concat([zeros1, df, zeros2, df2, zeros3, df3, zeros4, df4, zeros5 ], ignore_index=True)
times = pd.date_range('2012-10-01', periods=len(df5), freq='1min')
df6 = pd.concat([pd.DataFrame(times), df5], axis = 1, ignore_index=True)
segment = []
for i in range(0,len(df6)):
if df6.iloc[i,1]> 50:
segment.append(99)
else:
segment.append(0)
# Segmentation Algo
state_v,state_p = (0,0) # cycling through states (0,0), (99,0), (0,1), (99,1)
segments = ([],[])
for i,v in enumerate(segment):
if state_v == 0:
if v == 99:
start = i
state_v = 99
elif state_v == 99:
if v == 0:
end = i
segments[state_p].append((start, end))
state_v = 0
state_p = 1 - state_p
if state_v == 99:
end = len(segment)
segments[state_p].append((start, end))
# Plot
door_open, door_close = segments
plt.plot(df6[0], df6[1])
for o in door_open:
plt.axvline(df6[0][o[0]], linewidth=1, color='r', linestyle= '-')
plt.axvline(df6[0][o[1]], linewidth=1, color='r', linestyle= '-')
for c in door_close:
plt.axvline(df6[0][c[0]], linewidth=1, color='g', linestyle= '-')
plt.axvline(df6[0][c[1]], linewidth=1, color='g', linestyle= '-')
plt.xticks(rotation='vertical')
plt.show()

How to plot 3 or more values in plot.bar()

I tried to make plot.bar() using 2 values having them in a list, but I'm unable to plot 3 values.
I tried to add plot.bar(x,y,z), but it didn't work.
ce_data = ce_data.drop(
['pchangeinOpenInterest', 'totalTradedVolume', 'impliedVolatility', # this removes unecesssary items
'pChange', 'totalBuyQuantity', 'totalSellQuantity', 'bidQty',
'bidprice', 'askQty', 'askPrice', 'askQty', 'identifier', 'lastPrice', 'change', 'expiryDate',
'underlying'], axis=1)[
['openInterest', 'changeinOpenInterest', 'strikePrice', 'underlyingValue']]
style.use('ggplot')
ce_data.to_csv('kumar.csv')
df = pd.read_csv('kumar.csv', parse_dates=True, index_col=0)
pivot = df.iloc[2, 3] # this selects the strike price
pivot_round = round(pivot, -2) # round of the price
x = df['strikePrice'].tolist()
y = df['changeinOpenInterest'].tolist()
z = df['openInterest'].tolist()
for i in range(len(x)):
if int(x[i]) >= pivot_round - 400:
xleftpos = i
break
for i in range(len(x)):
if int(x[i]) >= pivot_round + 400:
xrightpos = i
break
x = x[xleftpos:xrightpos]
y = y[xleftpos:xrightpos]
z = z[xleftpos:xrightpos]
plot.bar([value for value in range(len(x))],y)
plot.set_xticks([idx + 0.5 for idx in range(len(x))])
plot.set_xticklabels(x, rotation=35, ha='right', size=10)
I am expecting strike price in x axis and y and z (change in oi and oi) in as bars.
IIUC, here's how I'd do it. This should have a single x-axis w/ 'strikePrice' and two bars of 'changeinOpenInterest' and 'openInterest'.
disp_df = df.pivot('strikePrice', 'changeinOpenInterest', 'openInterest')
disp_df.plot(kind='bar')
You can add the bells and whistles you want to the plot, but this avoids a lot of the manipulation you did above.

Categories

Resources