No handles with labels found to put in legend. plt.legend() - python

I have just started learning Python (this is literally my first lesson) and I was told to implement k-means with it. While doing so, I get an error when I call plt.legend(). I have read on Stack Overflow that we should use ax.legend instead, but either it didn't work or I wrote it wrong, so I am posting the code as it was before I changed it to use ax. My English is not very good, so please bear with me. Thank you.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Load the table that holds the 'Umur' and 'Gaji' columns.
df = pd.read_excel("Umur.xlsx")
df.head()
print(df)
# Scatter of the raw data before any clustering.
a = plt.scatter(df['Umur'],df['Gaji'])
plt.show()
# Partition the rows into three clusters using both columns.
km = KMeans(n_clusters=3)
km
y_predicted = km.fit_predict(df[['Umur','Gaji']])
y_predicted
print(y_predicted)
# Store each row's cluster id next to its data.
df['cluster'] = y_predicted
df.head()
print(df)
# One sub-frame per cluster so that each can be drawn in its own colour.
df1 = df[df.cluster==0]
df2 = df[df.cluster==1]
df3 = df[df.cluster==2]
# NOTE: none of the three scatter calls below passes label=..., so no artist
# carries a legend entry.
plt.scatter(df1.Umur,df1['Gaji'],color='green')
plt.scatter(df2.Umur,df2['Gaji'],color='red')
plt.scatter(df3.Umur,df3['Gaji'],color='black')
# Disabled centroid plot; it is the only call here that sets a label.
#plt.scatter(km.cluster_centers_[:,0],km_clusters_centers_[:,1],color='purple',marker='*',label='centroid')
plt.xlabel('Umur')
plt.ylabel('Gaji')
# With no labelled artists this call has nothing to list, which is what
# produces "No handles with labels found to put in legend."
plt.legend ()

I edited three lines and added one, as shown below:
...
# Keep the handle returned by each scatter call so it can be handed to
# plt.legend() explicitly.
gaji_green = plt.scatter(df1.Umur, df1['Gaji'], color='green')
gaji_red = plt.scatter(df2.Umur, df2['Gaji'], color='red')
gaji_black = plt.scatter(df3.Umur, df3['Gaji'], color='black')
...
# Pair every handle with the text shown for it in the legend.
plt.legend(
    (gaji_green, gaji_red, gaji_black),
    ('gaji_green', 'gaji_red', 'gaji_black'),
    scatterpoints=1,
    loc='lower left',
    ncol=1,
    fontsize=8,
)
...
Finally, the complete code looks like this:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data and assign every row to one of three clusters.
df = pd.read_excel("Umur.xlsx")
km = KMeans(n_clusters=3)
y_predicted = km.fit_predict(df[['Umur', 'Gaji']])
df['cluster'] = y_predicted

# One sub-frame per cluster so that each can be drawn in its own colour.
df1 = df[df.cluster == 0]
df2 = df[df.cluster == 1]
df3 = df[df.cluster == 2]

# Keep the handle returned by each scatter call for the legend below.
gaji_green = plt.scatter(df1.Umur, df1['Gaji'], color='green')
gaji_red = plt.scatter(df2.Umur, df2['Gaji'], color='red')
gaji_black = plt.scatter(df3.Umur, df3['Gaji'], color='black')
plt.xlabel('Umur')
plt.ylabel('Gaji')

# A single legend call with explicit handles and labels. A bare plt.legend()
# here would still report "No handles with labels found", because the
# scatter calls above do not set label=...
plt.legend(
    (gaji_green, gaji_red, gaji_black),
    ('gaji_green', 'gaji_red', 'gaji_black'),
    scatterpoints=1,
    loc='lower left',
    ncol=1,
    fontsize=8,
)
plt.show()

Related

Plot two probability density functions on one figure in Seaborn

I'm trying to plot two probability density functions on one figure(so that they overlap).
import matplotlib.pyplot as plt
import numpy
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
# Sample values to be min-max scaled and compared with a normal sample.
data = [0,0,1,2,2,2,2,1,2,3,0,5,4,5,4,6,2,2,5,4,6,3,2,5,4,3,7,-1,0]
df = pd.DataFrame(data, columns=['Numbers'])

# The scaler expects a 2-D array, so turn the list into a single column.
X = numpy.asarray(data).reshape(-1, 1)
scaler = MinMaxScaler()
standardized_data = scaler.fit_transform(X)

# Standard-normal sample with as many points as the data set.
normal_data = np.random.normal(loc=0.0, scale=1.0, size=len(df))

# One displot call per sample.
for sample in (normal_data, standardized_data):
    sns.displot(sample, kind='kde')
plt.show()
# Hand both samples to a single displot call as a name -> values mapping;
# squeeze() flattens the (n, 1) scaler output to one dimension.
kde_inputs = {
    "normal": normal_data,
    "standardized": standardized_data.squeeze(),
}
sns.displot(kde_inputs, kind='kde')

Overlay a line plot with different regions in Seaborn or Matplotlib

I would like to plot a line plot and make different overlay based on condition as illustrated below.
May I know how, or if possible, please kindly redirect me to right material on achieving the intended objective.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Two independent random sources: the legacy global state for the y values
# and a Generator for the per-chunk type.
np.random.seed(0)
rng = np.random.default_rng(2)

# Build four chunks of four rows; all rows of a chunk share one type.
mlist = []
for _ in range(4):
    yvals = np.random.rand(4).tolist()
    chunk_types = rng.integers(0, 6, size=(1)).tolist() * 4
    chunk = pd.DataFrame({'yval': yvals, 'type': chunk_types})
    mlist.append(chunk)

# Renumber the rows 0..n-1, then expose that numbering as an 'index' column.
stacked = pd.concat(mlist).reset_index(drop=True)
df = stacked.reset_index()

sns.lineplot(data=df, x="index", y="yval")
plt.show()
Suggestions using Matplotlib, Seaborn, or any other package are welcome.
The filling of the section was achieved using axvspan. I also used text for annotations.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Recreate the question's data: four chunks of four rows, one type per chunk.
np.random.seed(0)
rng = np.random.default_rng(2)
mlist = []
for _ in range(4):
    yvals = np.random.rand(4).tolist()
    chunk_types = rng.integers(0, 6, size=(1)).tolist() * 4
    mlist.append(pd.DataFrame({'yval': yvals, 'type': chunk_types}))
stacked = pd.concat(mlist).reset_index(drop=True)
df = stacked.reset_index()

g = sns.lineplot(data=df, x="index", y="yval")

# Shading colour per type; types missing from the mapping yield None.
overlay = {0: 'm', 1: 'gray', 5: 'r'}

# Walk the frame four rows at a time, shading and labelling each chunk.
for start in range(0, len(df), 4):
    block = df.iloc[start:start + 4]
    block_type = block['type'].unique()[0]
    left = block.index.min()
    right = block.index.max() + 1
    g.axvspan(left, right, color=overlay.get(block_type), alpha=0.3)
    g.text((left + right) / 2 - 1, 0.1, 'type {}'.format(block_type), fontsize=12)
plt.show()
Using Matplotlib add_patch and text
import numpy as np  # needed for np.random below
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as mcolors

# Colour names to draw the region colours from.
all_colors = list(mcolors.TABLEAU_COLORS.keys())

# Recreate the question's data: four chunks of four rows, one type per chunk.
np.random.seed(0)
rng = np.random.default_rng(2)
mlist = []
for _ in range(4):
    m = np.random.rand(4).tolist()
    n = rng.integers(0, 6, size=(1)).tolist() * 4
    mlist.append(pd.DataFrame(zip(m, n), columns=['yval', 'type']))
df = pd.concat(mlist).reset_index(drop=True).reset_index()

# A row whose type differs from the previous row starts a new region. The
# first row has no predecessor (diff is NaN), so give it a non-zero value to
# make it count as a start.
df['C'] = df['type'].diff()
df['C'] = df['C'].fillna(10)
nb = df.type[(df['C'] != 0)].to_frame().reset_index()

# Map every distinct type to a display label and to a colour.
unique_val = nb['type'].drop_duplicates().sort_values().tolist()
ngroup_type = {val: f'type {val}' for val in unique_val}
nb['ngroup'] = nb["type"].map(ngroup_type)
res = dict(zip(unique_val, all_colors[:len(unique_val)]))
nb["color"] = nb["type"].map(res)

starting_point = nb["index"].values.tolist()
mcolor = nb["color"].values.tolist()
group_type = nb["ngroup"].values.tolist()
# Each region runs up to the start of the next one (or the end of the data),
# so consecutive chunks that share a type are covered by a single rectangle.
end_point = starting_point[1:] + [len(df)]
nheight = 1

fg = sns.lineplot(data=df, x="index", y="yval")
for ncolor, spoint, epoint, gtype in zip(mcolor, starting_point, end_point, group_type):
    fg.axes.add_patch(
        patches.Rectangle(
            (spoint, 0), epoint - spoint, nheight,
            edgecolor='blue', facecolor=ncolor,
            fill=True, alpha=0.1, ls=':',
        )
    )
    fg.axes.text(spoint + 1.5, 0.1, gtype, size=10,
                 va="baseline", ha="left", multialignment="left")
plt.show()

Trouble creating scatter plot

I'm having trouble using the scatter to create a scatter plot. Can someone help me? I've highlighted the line causing the error:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('vetl8.csv')
df = pd.DataFrame(data=data)
clusterNum = 3

# Feature matrix: every column except the first, with NaNs replaced by 0.
X = df.iloc[:, 1:].values
X = np.nan_to_num(X)

# NOTE: the standardised copy is computed but never used; the model below is
# fit on the unscaled X.
Clus_dataSet = StandardScaler().fit_transform(X)
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)
k_means.fit(X)
labels = k_means.labels_

# Attach the cluster id to every row and save the result.
df["Labels"] = labels
df.to_csv('dfkmeans.csv')

# The error is raised on the next line: df[2] and df[1] look for columns
# *labelled* 2 and 1, not for the columns at positions 2 and 1.
plt.scatter(df[2], df[1], c=labels)
plt.xlabel('K', fontsize=18)
plt.ylabel('g', fontsize=16)
plt.show()
#data set correct
You are close, just a minor adjustment to access the x-y columns by number should fix it:
# Turn the column positions into column labels first, then select by label.
x_name, y_name = df.columns[2], df.columns[1]
plt.scatter(df[x_name], df[y_name], c=df["Labels"])

IndexError: only integers, slices (`:`)

I am trying to utilize k means clustering in Python and I am encountering an error. The dataset that I am working with can be found here:
https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle
import pandas as pd
import seaborn as sns
import numpy as np
#from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
from sklearn.preprocessing import scale, StandardScaler
import sklearn
from sklearn.cluster import KMeans
df = pd.read_csv('vehicles.csv')
df2 = df[['comb08', 'youSaveSpend']].copy()

# Standardise both columns.
scaler = StandardScaler()
scaler.fit(df2)
scaled_array = scaler.transform(df2)
average = np.mean(scaled_array[:, 0])
std = np.std(scaled_array[:, 0])

# From here on df2 is the NumPy array returned by the scaler, not a DataFrame.
df2 = scaled_array

# Elbow curve: fit one model per cluster count and record its score.
max_clusters = 10
noClusters = range(1, max_clusters + 1)
kmeans = [KMeans(n_clusters=i) for i in noClusters]
score = [model.fit(df2).score(df2) for model in kmeans]
plt.plot(noClusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel("Score")
plt.title('Elbow Curve')

# Final model; map its centroids back to the original units.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans = kmeans.fit(scaled_array)
unscaled = scaler.inverse_transform(kmeans.cluster_centers_)
unscaled
centroids = pd.DataFrame({'centroidx': unscaled[:, 0], 'centroidy': unscaled[:, 1]})

# df2 was rebound to a NumPy array above, so indexing it with the string
# 'label' is what raises the IndexError. (The np.int alias no longer exists
# in current NumPy; the builtin int is equivalent.)
df2['label'] = kmeans.labels_.astype(int)
df2.head()  # <======== Error Occurs Here
plt.scatter(df2['comb08'], df2['youSaveSpend'], c=df2.label)  # (x,y,color)
plt.scatter(centroids['centroidx'],
            centroids['centroidy'], c='red')  # (x,y,color)
plt.show()  # <======== Error also Occurs Here
The error I get is the below:
When I try to convert my floats like so:
# Run pd.to_numeric over each column of the full frame.
df2 = df.apply(pd.to_numeric)
I get this error:

plotting boxplot with sns

I would like to depict the value of my variables found in a dataset in the form of a boxplot. The dataset is the following:
https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)
So far my code is the following:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
# The data file has no header row, so the column names are supplied here.
column_names = [
    'id', 'clump_thickness', 'unif_cell_size',
    'unif_cell_shape', 'marg_adhesion', 'single_epith_cell_size',
    'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'Class',
]
df = pd.read_csv(file, names=column_names)

# boxplot
plt.figure(figsize=(15, 10))
names = list(df.columns)[:-1]  # every column except the trailing 'Class'

# Features are all columns but 'Class', rescaled with a min-max scaler.
X = df.drop(columns=["Class"])
columnsN = list(X.columns)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)  # normalization
X[columnsN] = x_scaled
y = df['Class']

sns.set_context('notebook', font_scale=1.5)
sns.boxplot(
    x=X['unif_cell_size'],
    y=y,
    data=df.iloc[:, :-1],
    orient="h",
)
My boxplot returns the following figure:
but I would like to display my information like the following graph:
I know that is from a different dataset, but I can see that they have displayed the diagnosis, at the same time, for each feature with their values. I have tried to do it in different ways, but I am not able to do that graph.
I have tried the following:
# Put the class column next to the scaled features, then melt with every
# feature column as an identifier.
wide = pd.concat([y, X], axis=1)
data_st = wide.melt(
    id_vars=columnsN,
    var_name="X",
    value_name='value',
)
sns.boxplot(x='value', y="X", data=data_st, hue=y, palette='Set1')
plt.legend(loc='best')
but still no results. Any help?
Thanks
Reshape the data with pandas.DataFrame.melt:
Most of the benign (class 2) boxplots are at 0 (scaled) or 1 (unscaled), as they should be
Run print(df_scaled_melted.groupby(['Class', 'Attributes', 'Values'])['Values'].count().unstack()) after the melt to see the counts behind each boxplot.
MinMaxScaler has been used, but is unnecessary in this case, because all of the data values are very close together. If you plot the data without scaling, the plot will look the same, except the y-axis range will be 1 - 10 instead.
This should really only be used in cases when the data is widely diverging, where an attribute will have too much influence with some ML algorithm.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# path to file
p = Path(r'c:\some_path_to_file\breast-cancer-wisconsin.data')

# create dataframe (the file has no header row, so names are supplied)
df = pd.read_csv(p, names=['id', 'clump_thickness', 'unif_cell_size',
                           'unif_cell_shape', 'marg_adhesion', 'single_epith_cell_size',
                           'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'Class'])

# replace ? with NaN (np.nan is the spelling that works on every NumPy
# version; the np.NaN alias was removed in NumPy 2.0)
df.replace('?', np.nan, inplace=True)

# scale the data: every column between 'id' and 'Class'
min_max_scaler = MinMaxScaler()
df_scaled = pd.DataFrame(min_max_scaler.fit_transform(df.iloc[:, 1:-1]))
df_scaled.columns = df.columns[1:-1]
df_scaled['Class'] = df['Class']

# melt the dataframe; 'id' was already left out above, so every remaining
# column is kept (slicing off the first one would drop 'clump_thickness')
df_scaled_melted = df_scaled.melt(id_vars='Class', var_name='Attributes', value_name='Values')

# plot the data
plt.figure(figsize=(12, 8))
g = sns.boxplot(x='Attributes', y='Values', hue='Class', data=df_scaled_melted)
for item in g.get_xticklabels():
    item.set_rotation(90)
# place the legend outside the axes, to the right
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
Without scaling:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
# Build the path from separate components so that it resolves on any OS,
# not only where '\' is the path separator.
p = Path.cwd() / 'data' / 'breast_cancer' / 'breast-cancer-wisconsin.data'

# The file has no header row, so the column names are supplied here.
df = pd.read_csv(p, names=['id', 'clump_thickness', 'unif_cell_size',
                           'unif_cell_shape', 'marg_adhesion', 'single_epith_cell_size',
                           'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'Class'])

# Turn '?' into NaN, drop the affected rows, then cast everything to int.
# (np.nan works on every NumPy version; np.NaN was removed in NumPy 2.0.)
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)
df = df.astype('int')

# Skip the leading 'id' column and reshape to one row per (Class, attribute).
df_melted = df.iloc[:, 1:].melt(id_vars='Class', var_name='Attributes', value_name='Values')

plt.figure(figsize=(12, 8))
g = sns.boxplot(x='Attributes', y='Values', hue='Class', data=df_melted)
for item in g.get_xticklabels():
    item.set_rotation(90)
# place the legend outside the axes, to the right
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

Categories

Resources