I'm having trouble using `plt.scatter` to create a scatter plot. Can someone help me? I've marked the line causing the error with `#Here`:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
# Question snippet: cluster the rows of a CSV with k-means and colour a
# scatter plot by cluster label. Kept exactly as posted, including the line
# that raises the error being asked about.

# read_csv already returns a DataFrame; the next line just re-wraps it.
data = pd.read_csv('vetl8.csv')
df = pd.DataFrame(data=data)
clusterNum = 3
# Feature matrix: every column except the first, as a NumPy array.
X = df.iloc[:, 1:].values
# Replace NaN/inf entries with finite numbers before fitting.
X = np.nan_to_num(X)
# NOTE(review): the standardized copy is computed here, but the model below
# is fitted on the unscaled X, so Clus_dataSet is never used in this snippet.
Clus_dataSet = StandardScaler().fit_transform(X)
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)
k_means.fit(X)
labels = k_means.labels_
# Store the cluster id of each row next to the data and save it.
df["Labels"] = labels
df.to_csv('dfkmeans.csv')
# NOTE(review): this is the line the question flags. df[2] / df[1] look up
# columns *labelled* 2 and 1, not the third/second column by position;
# presumably the CSV has string headers, so the lookup fails - confirm.
# The trailing "**#Here**" is the poster's highlight marker, not part of
# the call.
plt.scatter(df[2], df[1], c=labels) **#Here**
plt.xlabel('K', fontsize=18)
plt.ylabel('g', fontsize=16)
plt.show()
#data set correct
You are close, just a minor adjustment to access the x-y columns by number should fix it:
# Resolve the column labels by position first (third column on x, second on
# y), then index the frame with those labels.
x_label = df.columns[2]
y_label = df.columns[1]
plt.scatter(df[x_label], df[y_label], c=df["Labels"])
Related
I have just started learning Python - this is literally my first lesson - and I was told to implement k-means with it. While doing so, I get an error when I call plt.legend(). I read on SO (Stack Overflow) that we should use ax.legend instead, but either it didn't work or I wrote it wrong, so I am posting the code as it was before I changed it to use ax. My English is not very good, so please bear with me. Thank you.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Question snippet: k-means on the 'Umur' and 'Gaji' columns of a spreadsheet,
# then one scatter call per cluster. Kept exactly as posted.
df = pd.read_excel("Umur.xlsx")
# Bare expressions such as df.head(), km and y_predicted below only display
# something in a notebook/REPL; in a script they have no visible effect.
df.head()
print(df)
# Raw data before clustering. The returned handle `a` is not used afterwards.
a = plt.scatter(df['Umur'],df['Gaji'])
plt.show()
km = KMeans(n_clusters=3)
km
# Fit on the two columns and get one cluster id (0, 1 or 2) per row.
y_predicted = km.fit_predict(df[['Umur','Gaji']])
y_predicted
print(y_predicted)
df['cluster'] = y_predicted
df.head()
print(df)
# One sub-frame per cluster id so each can be drawn in its own colour.
df1 = df[df.cluster==0]
df2 = df[df.cluster==1]
df3 = df[df.cluster==2]
plt.scatter(df1.Umur,df1['Gaji'],color='green')
plt.scatter(df2.Umur,df2['Gaji'],color='red')
plt.scatter(df3.Umur,df3['Gaji'],color='black')
# NOTE(review): disabled centroid overlay. As written it reads the y values
# from `km_clusters_centers_`, while the x values use `km.cluster_centers_`.
#plt.scatter(km.cluster_centers_[:,0],km_clusters_centers_[:,1],color='purple',marker='*',label='centroid')
plt.xlabel('Umur')
plt.ylabel('Gaji')
# NOTE(review): none of the active scatter calls above passes label=, so this
# legend has no entries to show - presumably the source of the reported
# message; confirm against the actual output.
plt.legend ()
I edited three lines and added one line, as shown below:
...
# Keep the handle returned by each scatter call so the legend can refer to it.
gaji_green = plt.scatter(df1.Umur, df1['Gaji'], color='green')
gaji_red = plt.scatter(df2.Umur, df2['Gaji'], color='red')
gaji_black = plt.scatter(df3.Umur, df3['Gaji'], color='black')
...
# Explicit (handles, labels) pair: the legend no longer depends on the
# artists carrying a label= of their own.
plt.legend(
    (gaji_green, gaji_red, gaji_black),
    ('gaji_green', 'gaji_red', 'gaji_black'),
    scatterpoints=1,
    loc='lower left',
    ncol=1,
    fontsize=8,
)
...
Finally, the complete code looks like this:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Load the spreadsheet and assign every row to one of three k-means clusters
# computed from the 'Umur' and 'Gaji' columns.
df = pd.read_excel("Umur.xlsx")
km = KMeans(n_clusters=3)
y_predicted = km.fit_predict(df[['Umur', 'Gaji']])
df['cluster'] = y_predicted

# One sub-frame per cluster id so each cluster gets its own colour.
df1 = df[df.cluster == 0]
df2 = df[df.cluster == 1]
df3 = df[df.cluster == 2]

# Keep the scatter handles; they are passed to the legend below.
gaji_green = plt.scatter(df1.Umur, df1['Gaji'], color='green')
gaji_red = plt.scatter(df2.Umur, df2['Gaji'], color='red')
gaji_black = plt.scatter(df3.Umur, df3['Gaji'], color='black')

plt.xlabel('Umur')
plt.ylabel('Gaji')

# Single legend call with explicit handles and labels. There is deliberately
# no bare plt.legend() here: none of the artists has a label=, so such a call
# would have nothing to show.
plt.legend(
    (gaji_green, gaji_red, gaji_black),
    ('gaji_green', 'gaji_red', 'gaji_black'),
    scatterpoints=1,
    loc='lower left',
    ncol=1,
    fontsize=8,
)
plt.show()
I am trying to use TSNE to visualize data based on a Category to show me if the data is separable.
I have been trying to do this for the past two days but I am not getting a scatter plot showing the different categories plotted to enable me to see the relationship.
Instead, it plots all the data along a single straight line, which cannot be correct, as the column I am trying to use as the label and legend contains 5 distinct values.
What do I do to correct this?
import label as label
import pandas as pd
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import numpy as np
# #region Loading Data
# Question snippet: intended to visualise the data per 'Activity' category.
# Kept exactly as posted (the loop body's indentation was already missing).
filename = 'Dataset/test.csv'
df = pd.read_csv(filename)
# Remove the 'Activity' column from df and keep it as the label series.
# NOTE(review): this rebinds `label`, shadowing the `import label as label`
# at the top of the snippet.
label = df.pop('Activity')
# Not used again in this snippet.
label_counts = label.value_counts()
# Scale data
scale = StandardScaler()
# NOTE(review): despite the name, this holds only the standardized features;
# TSNE is imported but never applied anywhere in this snippet.
tsne_data= scale.fit_transform(df)
# Two stacked axes are created; only axa[0] is drawn on below.
fig, axa = plt.subplots(2, 1, figsize=(15,10))
group = label.unique()
# NOTE(review): Series.iteritems() was removed in pandas 2.0 (items() is the
# replacement). The loop variables are not used by the body, so the same
# scatter call is repeated once per row.
for i , labels in label.iteritems():
# mask =(label = group)
# NOTE(review): x and y are the *same* array, so every point lies on the
# y = x diagonal - this is the straight line described in the question.
# label=group passes the whole array of unique categories, not one category.
axa[0].scatter(x = tsne_data, y = tsne_data, label = group)
# NOTE(review): no parentheses - this only references the function and never
# calls it, so no legend is drawn.
plt.legend
plt.show()
There is my code:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn import datasets
# Load iris and unpack the arrays and names used below.
data = datasets.load_iris(return_X_y=False)
X, y = data.data, data.target
names = data.feature_names
target_names = data.target_names
columns = names + ['target']

# Features and the class id side by side in one frame.
df = pd.DataFrame(np.column_stack([X, y]), columns=columns)

# Readable species name for each class id.
for class_id, species in enumerate(('setosa', 'versicolor', 'virginica')):
    df.loc[df.target == class_id, 'target_names'] = species

indexes = df.index.tolist()

# 2x2 grid: one feature per panel, plotted against the row index and coloured
# by class id.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
panel_features = (
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
)
for ax, feature in zip(axes.flat, panel_features):
    ax.scatter(indexes, df[feature], c=y)
plt.show()
How can I add a legend to each scatter plot, where each legend entry is a value of y?
As far as I understand there is no direct way of making the scatter with tags on each data point.
This answer suggests iterating over your data points and labels, once you have created the scatter plots:
# Write each point's class value next to it on the first panel.
# Iterating the x positions, y values and labels together avoids the chained
# df[...][i] lookup, which goes through the index labels rather than positions.
sepal_length = df['sepal length (cm)']
for x_pos, y_pos, txt in zip(indexes, sepal_length, y):
    axes[0, 0].annotate(txt, (x_pos, y_pos))
...
You can look at formatting options here.
I would like to depict the value of my variables found in a dataset in the form of a boxplot. The dataset is the following:
https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)
So far my code is the following:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
# Question snippet: min-max scale the features and draw a boxplot. Kept
# exactly as posted.
# NOTE(review): `file` is not defined anywhere in this snippet.
df=pd.read_csv(file,names=['id', 'clump_thickness','unif_cell_size',
'unif_cell_shape', 'marg_adhesion', 'single_epith_cell_size',
'bare_nuclei', 'bland_chromatin', 'normal_nucleoli','mitoses','Class'])
#boxplot
plt.figure(figsize=(15,10))
# All column names except the last ('Class'). Not used again in this snippet.
names=list(df.columns)
names=names[:-1]
min_max_scaler=preprocessing.MinMaxScaler()
# Everything except the target. NOTE(review): this still contains 'id', so
# the identifier column is scaled along with the real features.
X = df.drop(["Class"],axis=1)
columnsN=list(X.columns)
# NOTE(review): the answer code later in this file replaces '?' with NaN
# before scaling; no such step appears here - confirm fit_transform copes
# with the raw columns.
x_scaled=min_max_scaler.fit_transform(X) #normalization
# Write the scaled values back under the original column names.
X[columnsN]=x_scaled
y = df['Class']
sns.set_context('notebook', font_scale=1.5)
# Only one feature ('unif_cell_size') is passed as x, against Class as y, so
# the figure covers that single feature rather than all of them.
sns.boxplot(x=X['unif_cell_size'],y=y,data=df.iloc[:, :-1],orient="h")
My boxplot returns the following figure:
but I would like to display my information like the following graph:
I know that is from a different dataset, but I can see that they have displayed the diagnosis, at the same time, for each feature with their values. I have tried to do it in different ways, but I am not able to do that graph.
I have tried the following:
# Second attempt from the question: reshape to long form, then plot. Kept
# exactly as posted.
# Class column followed by the scaled feature columns.
data_st = pd.concat([y,X],axis=1)
# NOTE(review): id_vars lists every feature column, so the only column left
# to melt is 'Class'. The "X" column therefore holds just the string 'Class'
# and 'value' holds the class ids - the features are never stacked.
data_st = pd.melt(data_st,id_vars=columnsN,
var_name="X",
value_name='value')
# NOTE(review): hue=y passes the original Class Series, not a column of
# data_st.
sns.boxplot(x='value', y="X", data=data_st,hue=y,palette='Set1')
plt.legend(loc='best')
but still no results. Any help?
Thanks
Reshape the data with pandas.DataFrame.melt:
Most of the benign (class 2) boxplots are at 0 (scaled) or 1 (unscaled), as they should be
To understand the counts, run print(df_scaled_melted.groupby(['Class', 'Attributes', 'Values'])['Values'].count().unstack()) after the melt.
MinMaxScaler has been used, but is unnecessary in this case, because all of the data values are very close together. If you plot the data without scaling, the plot will look the same, except the y-axis range will be 1 - 10 instead.
Scaling should really only be used when the attributes' ranges diverge widely, so that one attribute would otherwise have too much influence on some ML algorithm.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# path to file
p = Path(r'c:\some_path_to_file\breast-cancer-wisconsin.data')

# Create the dataframe. '?' entries are read as NaN directly (na_values), so
# every column is numeric from the start; this also avoids the np.NaN alias,
# which no longer exists in NumPy 2.0.
df = pd.read_csv(
    p,
    names=['id', 'clump_thickness', 'unif_cell_size',
           'unif_cell_shape', 'marg_adhesion', 'single_epith_cell_size',
           'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'Class'],
    na_values='?',
)

# Scale the feature columns only: everything between 'id' and 'Class'.
feature_cols = df.columns[1:-1]
min_max_scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    min_max_scaler.fit_transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index,
)
df_scaled['Class'] = df['Class']

# Melt to long form: one row per (Class, attribute, value).
# 'id' is already excluded above, so the whole frame is melted; slicing off
# the first column here would drop 'clump_thickness' from the plot.
df_scaled_melted = df_scaled.melt(id_vars='Class', var_name='Attributes', value_name='Values')

# Plot the data: one pair of boxes (per Class) for every attribute.
plt.figure(figsize=(12, 8))
g = sns.boxplot(x='Attributes', y='Values', hue='Class', data=df_scaled_melted)
# Attribute names are long; rotate them so they do not overlap.
for item in g.get_xticklabels():
    item.set_rotation(90)
# Put the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
Without scaling:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
# Build the path from separate components so it resolves on any OS, not only
# where '\' is the path separator.
p = Path.cwd() / 'data' / 'breast_cancer' / 'breast-cancer-wisconsin.data'

# '?' entries are read as NaN directly (na_values); this avoids the np.NaN
# alias, which no longer exists in NumPy 2.0.
df = pd.read_csv(
    p,
    names=['id', 'clump_thickness', 'unif_cell_size',
           'unif_cell_shape', 'marg_adhesion', 'single_epith_cell_size',
           'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'Class'],
    na_values='?',
)

# Drop incomplete rows, then store everything as integers.
df = df.dropna()
df = df.astype('int')

# Skip the 'id' column and melt to long form: one row per
# (Class, attribute, value).
df_melted = df.iloc[:, 1:].melt(id_vars='Class', var_name='Attributes', value_name='Values')

plt.figure(figsize=(12, 8))
g = sns.boxplot(x='Attributes', y='Values', hue='Class', data=df_melted)
# Attribute names are long; rotate them so they do not overlap.
for item in g.get_xticklabels():
    item.set_rotation(90)
# Put the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
Currently I have the following code
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix
df = pd.read_csv(file, sep=',')

# One colour per row: red where class equals 1, blue otherwise
# (class is either 1 or 0).
colors = ['r' if class_value == 1 else 'b' for class_value in df['class']]

plt.figure()
scatter_matrix(df, color=colors)
plt.show()
It shows the following output
But on the diagonal of this plot, instead of a simple histogram, I want to show a stacked histogram like the following, in which class '1' is red and class '0' is blue.
Please guide me: how can I do this?
The use of seaborn is probably highly beneficial for plotting a scatter matrix kind of plot. However, I do not know how to plot a stacked histogram easily into the diagonal of a PairGrid in seaborn.
Since the question asks for matplotlib anyway, the following is a solution using pandas and matplotlib. Unfortunately, it requires doing a lot of things by hand. Here is an example (note that seaborn is only imported to get some data, since the question did not provide any).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# seaborn import just needed to get some data
import seaborn as sns
# Scatter matrix with a stacked histogram (one layer per category) on the
# diagonal, drawn with plain matplotlib.
df = sns.load_dataset("iris")

n_hist = 10  # number of bin *edges* per histogram, i.e. n_hist - 1 bars
category = "species"
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# The grouping does not depend on the panel, so do it once. Using the same
# group order in every panel keeps the colour of each category consistent.
groups = list(df.groupby(category))

fig, axes = plt.subplots(nrows=len(columns), ncols=len(columns),
                         sharex="col")

for i, row in enumerate(columns):
    for j, col in enumerate(columns):
        ax = axes[i, j]
        if i == j:
            # diagonal: stacked histogram of `col`
            hist_bins = np.linspace(df[col].min(), df[col].max(), n_hist)
            bar_width = np.diff(hist_bins)[0]
            # Running top of the stack; each layer starts where the previous
            # one ended and adds only its own counts.
            stack_top = np.zeros(len(hist_bins) - 1)
            for name, group in groups:
                counts, _ = np.histogram(group[col].dropna(), bins=hist_bins)
                # align="edge": x holds the left bin edges, so each bar must
                # start there rather than be centred on it.
                ax.bar(hist_bins[:-1], counts, width=bar_width,
                       bottom=stack_top, align="edge", label=name)
                stack_top = stack_top + counts
        else:
            # offdiagonal: scatter of `col` (x) against `row` (y) per category
            for name, group in groups:
                ax.scatter(group[col], group[row], s=5, label=name)
        ax.set_xlabel(col)
        ax.set_ylabel(row)
        # Every artist carries label=<category>, so ax.legend() can be called
        # here if a per-panel legend is wanted.

plt.tight_layout()
plt.show()
Sample code
import seaborn as sns
# Seaborn version: a single pairplot call draws the full grid, coloured by
# the "species" column.
sns.set(style="ticks")
df = sns.load_dataset("iris")
sns.pairplot(data=df, hue="species")