I am trying to use k-means clustering in Python and I am running into an error. The dataset I am working with can be found here:
https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
from sklearn.preprocessing import scale, StandardScaler
import sklearn
from sklearn.cluster import KMeans
df = pd.read_csv('vehicles.csv')
df2 = df[['comb08','youSaveSpend']].copy()
scaler = StandardScaler()
scaler.fit(df2)
scaled_array = scaler.transform(df2)
average = np.mean(scaled_array[:,0])
std = np.std(scaled_array[:,0])
df2 = scaled_array
max_clusters = 10
noClusters = range(1, max_clusters + 1)
kmeans = [KMeans(n_clusters = i) for i in noClusters]
score = [kmeans[i].fit(df2).score(df2) for i in range(len(kmeans))]  # fit each model on the scaled data and record its score
plt.plot(noClusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel("Score")
plt.title('Elbow Curve')
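(A note on reading the curve: KMeans.score returns the negative inertia, so the values rise toward zero as the cluster count grows; the elbow is the bend where the improvement levels off.)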
kmeans = KMeans(n_clusters = 10, random_state = 0)
kmeans = kmeans.fit(scaled_array)
unscaled = scaler.inverse_transform(kmeans.cluster_centers_)
unscaled
centroids = pd.DataFrame({'centroidx':unscaled[:,0],'centroidy':unscaled[:,1]})
df2['label'] = kmeans.labels_.astype(int)
df2.head() # <======== Error Occurs Here
plt.scatter(df2['comb08'], df2['youSaveSpend'], c=df2.label) # (x,y,color)
plt.scatter(centroids['centroidx'], \
centroids['centroidy'], c='red') # (x,y,color)
plt.show()  # <======== Error also occurs here
The error I get is below:
When I try to convert my floats like so:
df2 = df.apply(pd.to_numeric)
I get this error:
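For what it's worth, the likely cause of the df2.head() failure is the line df2 = scaled_array: it replaces the DataFrame with the plain NumPy array that StandardScaler returns, and an array has no head() method and no named columns to assign into. The df.apply(pd.to_numeric) attempt likely fails for a different reason: the full vehicles.csv contains text columns (make, model, and so on). A minimal sketch of one way around both, assuming the same two columns:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
df = pd.read_csv('vehicles.csv')
df2 = df[['comb08', 'youSaveSpend']].copy()  # numeric columns only
scaler = StandardScaler()
scaled_array = scaler.fit_transform(df2)  # an ndarray, not a DataFrame
kmeans = KMeans(n_clusters=10, random_state=0).fit(scaled_array)
df2['label'] = kmeans.labels_  # df2 is still a DataFrame, so this works
df2.head()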
I'm trying to plot two probability density functions on one figure (so that they overlap).
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
data = [0,0,1,2,2,2,2,1,2,3,0,5,4,5,4,6,2,2,5,4,6,3,2,5,4,3,7,-1,0]
scaler = MinMaxScaler()
df = pd.DataFrame(data, columns=['Numbers'])
X = np.asarray(data)
X=X.reshape(-1,1)
standardized_data = scaler.fit_transform(X)
normal_data = np.random.normal(loc=0.0, scale=1.0, size=len(df))
sns.displot(normal_data, kind='kde')
sns.displot(standardized_data, kind='kde')
plt.show()
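displot is figure-level and opens a new figure on every call, which is why the two calls above come out as two separate plots. One way to get both densities on a single figure is one displot call with the arrays in a dict; each key becomes its own curve and legend entry: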
sns.displot(
    {"normal": normal_data, "standardized": standardized_data.squeeze()},
    kind='kde'
)
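Alternatively (a sketch over the same arrays), kdeplot is axes-level and draws onto the current Axes, so consecutive calls overlap on one figure:
sns.kdeplot(normal_data, label='normal')
sns.kdeplot(standardized_data.squeeze(), label='standardized')
plt.legend()
plt.show()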
I just learned Python; this is literally my first lesson, and I was told to do k-means with Python. While I was doing it, I got an error when I used plt.legend(). I have read on Stack Overflow that we should use ax.legend, but apparently either it didn't work or I wrote it wrong, so I thought I would just give the code from before I changed it to ax. My English is not very good, so please bear with me. Thank you.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_excel("Umur.xlsx")
df.head()
print(df)
plt.scatter(df['Umur'], df['Gaji'])
plt.show()
km = KMeans(n_clusters=3)
km
y_predicted = km.fit_predict(df[['Umur','Gaji']])
y_predicted
print(y_predicted)
df['cluster'] = y_predicted
df.head()
print(df)
df1 = df[df.cluster==0]
df2 = df[df.cluster==1]
df3 = df[df.cluster==2]
plt.scatter(df1.Umur,df1['Gaji'],color='green')
plt.scatter(df2.Umur,df2['Gaji'],color='red')
plt.scatter(df3.Umur,df3['Gaji'],color='black')
#plt.scatter(km.cluster_centers_[:,0],km.cluster_centers_[:,1],color='purple',marker='*',label='centroid')
plt.xlabel('Umur')
plt.ylabel('Gaji')
plt.legend()
I edited three lines and added one call, like below:
...
gaji_green = plt.scatter(df1.Umur,df1['Gaji'],color='green')
gaji_red = plt.scatter(df2.Umur,df2['Gaji'],color='red')
gaji_black = plt.scatter(df3.Umur,df3['Gaji'],color='black')
...
plt.legend((gaji_green, gaji_red, gaji_black),
           ('gaji_green', 'gaji_red', 'gaji_black'),
           scatterpoints=1,
           loc='lower left',
           ncol=1,
           fontsize=8)
...
Finally, the full code looks like below:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_excel("Umur.xlsx")
km = KMeans(n_clusters=3)
y_predicted = km.fit_predict(df[['Umur','Gaji']])
df['cluster'] = y_predicted
df1 = df[df.cluster==0]
df2 = df[df.cluster==1]
df3 = df[df.cluster==2]
gaji_green = plt.scatter(df1.Umur,df1['Gaji'],color='green')
gaji_red = plt.scatter(df2.Umur,df2['Gaji'],color='red')
gaji_black = plt.scatter(df3.Umur,df3['Gaji'],color='black')
plt.xlabel('Umur')
plt.ylabel('Gaji')
plt.legend((gaji_green, gaji_red, gaji_black),
           ('gaji_green', 'gaji_red', 'gaji_black'),
           scatterpoints=1,
           loc='lower left',
           ncol=1,
           fontsize=8)
plt.show()
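A side note: an arguably simpler variant of the same idea is to pass label= directly to each scatter call, after which a bare plt.legend() picks the labels up on its own (the cluster names here are placeholders):
gaji_green = plt.scatter(df1.Umur, df1['Gaji'], color='green', label='cluster 0')
gaji_red = plt.scatter(df2.Umur, df2['Gaji'], color='red', label='cluster 1')
gaji_black = plt.scatter(df3.Umur, df3['Gaji'], color='black', label='cluster 2')
plt.legend()
plt.show()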
I'm having trouble using scatter to create a scatter plot. Can someone help me? I've highlighted the line causing the error:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('vetl8.csv')
df = pd.DataFrame(data=data)
clusterNum = 3
X = df.iloc[:, 1:].values
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
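# note: Clus_dataSet holds the scaled features but is never used below;
# k_means.fit(X) on the next line runs on the unscaled values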
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)
k_means.fit(X)
labels = k_means.labels_
df["Labels"] = labels
df.to_csv('dfkmeans.csv')
plt.scatter(df[2], df[1], c=labels)  # <-- error here
plt.xlabel('K', fontsize=18)
plt.ylabel('g', fontsize=16)
plt.show()
(The data set itself is correct.)
You are close, just a minor adjustment to access the x-y columns by number should fix it:
plt.scatter(df[df.columns[2]], df[df.columns[1]], c=df["Labels"])
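Equivalently, iloc selects columns by position:
plt.scatter(df.iloc[:, 2], df.iloc[:, 1], c=df["Labels"])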
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
ridge_clf = RidgeCV(alphas=alphas,scoring='r2').fit(X, y)
ridge_clf.score(X, y)
I wanted to plot the following graph for RidgeCV. I don't see any option to do that as there is with GridSearchCV. I appreciate your suggestions!
There is no indication of what the colors stand for. I assume they stand for the features, and we investigate the size of each feature's weight as a function of alpha. Here is my solution:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
w = []
for a in alphas:
    ridge_clf = RidgeCV(alphas=[a], cv=10).fit(X, y)
    w.append(ridge_clf.coef_)
w = np.array(w)
plt.semilogx(alphas,w)
plt.title('Ridge coefficients as function of the regularization')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.legend(X.keys())
Output:
Since you only have two features in X there are only two lines.
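If you also want to mark the alpha that RidgeCV itself would pick on the full grid (an optional extra, using the alpha_ attribute of the fitted estimator):
ridge_best = RidgeCV(alphas=alphas, cv=10).fit(X, y)
plt.axvline(ridge_best.alpha_, linestyle='--', color='grey')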
Here is the code for generating the plot that you posted.
First, we need to understand that RidgeCV does not return the coefficients for each alpha value fed in through the alphas parameter.
The motivation behind RidgeCV is that it tries the alpha values given in alphas and then, based on cross-validation scoring, returns the best alpha along with the model fitted at that alpha.
Hence, the only way to get the coefficients for each alpha value under cross-validation is to iterate, fitting RidgeCV on one alpha value at a time.
Example:
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# X is the 10x10 Hilbert matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
coefs = []
for a in alphas:
    ridge = linear_model.RidgeCV(alphas=[a], fit_intercept=False, cv=3)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
# #############################################################################
# Display results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('RidgeCV coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
I am using Python for k-means clustering on the MNIST database (http://yann.lecun.com/exdb/mnist/). I can cluster the data successfully, but I am unable to label the clusters; that is, I cannot see which cluster number holds which digit. For example, cluster 5 might hold digit 7.
I need to write code to correctly label the clusters after the k-means clustering has been done, and also to add a legend to the plot.
from __future__ import division, print_function, absolute_import
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D #only needed for 3D plots
#scikit learn
from sklearn.cluster import KMeans
#pandas to read excel file
import pandas
import xlrd
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
Links:
MNIST Dataset: http://yann.lecun.com/exdb/mnist/
df = pandas.read_csv('test_encoded_with_label.csv', header=None,
                     delim_whitespace=True)
#df = pandas.read_excel('test_encoded_with_label.xls')
#print column names
print(df.columns)
df1 = df.iloc[:,0:2] #0 and 1, the last index is not used for iloc
labels = df.iloc[:,2]
labels = labels.values
dataset = df1.values
#train indices - depends how many samples
trainidx = np.arange(0,9999)
testidx = np.arange(0,9999)
train_data = dataset[trainidx,:]
test_data = dataset[testidx,:]
train_labels = labels[trainidx] #just 1D, no :
tpredct_labels = labels[testidx]
kmeans = KMeans(n_clusters=10, random_state=0).fit(train_data)
kmeans.labels_
#print(kmeans.labels_.shape)
plt.scatter(train_data[:,0],train_data[:,1], c=kmeans.labels_)
predct_labels = kmeans.predict(train_data)
print(predct_labels)
print('actual label', tpredct_labels)
centers = kmeans.cluster_centers_
print(centers)
plt.show()
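One common way to find out which digit each cluster holds (a sketch reusing train_labels and kmeans from the code above): give every cluster the majority true label among the points assigned to it.
cluster_to_digit = {}
for c in range(10):
    members = train_labels[kmeans.labels_ == c]
    if len(members) > 0:
        # np.bincount counts how often each digit 0-9 occurs in this cluster
        cluster_to_digit[c] = int(np.bincount(members.astype(int)).argmax())
print(cluster_to_digit)  # e.g. {0: 7, 1: 2, ...}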
To create markers that identify the clusters of labelled points, you can use the annotate method.
Here is sample code run on the sklearn digits dataset, where I mark the centroids of the resulting clustering. Note that I label the clusters 0-9 purely for illustration:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
np.random.seed(42)
digits = load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target
h = .02
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)
centroids = kmeans.cluster_centers_
plt_data = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_, cmap=plt.cm.get_cmap('Spectral', 10))
plt.colorbar()
plt.scatter(centroids[:, 0], centroids[:, 1],
marker='x')
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
'Centroids are marked with white cross')
plt.xlabel('component 1')
plt.ylabel('component 2')
labels = ['{0}'.format(i) for i in range(10)]
for i in range(10):
    xy = (centroids[i, 0], centroids[i, 1])
    plt.annotate(labels[i], xy, horizontalalignment='right', verticalalignment='top')
plt.show()
This is the result you get:
To add the legend, give each cluster its own labelled scatter call so plt.legend() has named handles to pick up (passing the whole labels_ array as label= produces a single, unhelpful entry):
for cluster_id in np.unique(kmeans.labels_):
    mask = kmeans.labels_ == cluster_id
    plt.scatter(train_data[mask, 0], train_data[mask, 1], label=str(cluster_id))
plt.legend()