Below is a sample of the data set that I am using:
id,product,store,revenue,store_capacity,state
1,Ball,AB,222,1000,CA
1,Pen,AB,234,1452,WD
2,Books,CD,543,888,MA
2,Ink,EF,123,9865,NY
The code is below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
sns.set(rc={'figure.figsize':(11.7,8.27)})
df = pd.read_csv(r'1.csv',index_col=None)
dummies = pd.get_dummies(data = df)
km = KMeans(n_clusters=2).fit(dummies)
labels = km.predict(dummies)
dummies['cluster_id'] = km.labels_
def distance_to_centroid(row, centroid):
    # Keep only the columns KMeans was fitted on (drop columns added afterwards).
    feature_cols = [c for c in dummies.columns
                    if c != 'cluster_id' and not c.startswith('distance_to_center')]
    return euclidean(row[feature_cols], centroid)

dummies['distance_to_center0'] = dummies.apply(
    lambda r: distance_to_centroid(r, km.cluster_centers_[0]), axis=1)
dummies['distance_to_center1'] = dummies.apply(
    lambda r: distance_to_centroid(r, km.cluster_centers_[1]), axis=1)
# Only two clusters were fitted (n_clusters=2), so there is no cluster_centers_[2].
dummies_df = dummies[['distance_to_center0','distance_to_center1','cluster_id']]
test = {0:"Blue", 1:"Red", 2:"Green"}
sns.scatterplot(x="distance_to_center0", y="distance_to_center1", data=dummies_df, hue="cluster_id", palette = test)
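As an aside, scikit-learn's KMeans.transform() returns the distance from every row to every cluster centre in one call, which avoids the row-wise apply. A minimal sketch, assuming km was fitted on the dummy columns before the extra columns were added:
# Distances of every row to every cluster centre, computed in one call.
extra_cols = ['cluster_id', 'distance_to_center0', 'distance_to_center1']
feature_cols = [c for c in dummies.columns if c not in extra_cols]
distances = km.transform(dummies[feature_cols])  # shape: (n_rows, n_clusters)
for k in range(distances.shape[1]):
    dummies[f'distance_to_center{k}'] = distances[:, k]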
I need to get the centre of each cluster. The code below only gets the centroid assigned to each element, i.e. the centre point that each individual element belongs to:
centroids = km.cluster_centers_
centroid_labels = [centroids[i] for i in labels]
centroid_labels
I want to get the centre point of each cluster
Courtesy of #Isma:
from sklearn.metrics import pairwise_distances_argmin_min

km = KMeans(n_clusters=7).fit(dummies)
closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, dummies)
closest
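For the centre point of each cluster itself, km.cluster_centers_ already holds one row per cluster in the fitted feature space. A minimal sketch that labels those coordinates with the feature names, assuming km was fitted on exactly the columns currently in dummies:
# One row per cluster; each column is that feature's coordinate of the centre.
centers_df = pd.DataFrame(km.cluster_centers_, columns=dummies.columns)
print(centers_df)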
I need to cluster data using Fuzzy C-Means, so I use fcm from pyclustering.cluster.fcm. I would like to know whether there is a way to get the labels.
import numpy as np
import pandas as pd
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm
import random
coords = [(random.random()*2.0, random.random()*2.0) for _ in range(100)]
dfcluster = pd.DataFrame(coords, columns = ['x','y'])
sample = dfcluster.to_numpy()
# initialize
initial_centers = kmeans_plusplus_initializer(sample, 5, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
# create instance of Fuzzy C-Means algorithm
fcm_instance = fcm(sample, initial_centers)
# run cluster analysis and obtain results
fcm_instance.process()
clusters = fcm_instance.get_clusters()
print(clusters)
I have tried it this way, and it works, but I do not think it is a perfect answer:
import pandas as pd
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm
import random
coords = [(random.random()*2.0, random.random()*2.0) for _ in range(100)]
dfcluster = pd.DataFrame(coords, columns = ['x','y'])
sample = dfcluster.to_numpy()
# initialize
initial_centers = kmeans_plusplus_initializer(sample, 5, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
# create instance of Fuzzy C-Means algorithm
fcm_instance = fcm(sample, initial_centers)
# run cluster analysis and obtain results
fcm_instance.process()
clusters = fcm_instance.get_clusters()
cluster = 0
dfclusternew = pd.DataFrame(columns=['cluster', 'x', 'y'])
for index, i in enumerate(clusters):
    for j in i:
        # Note: DataFrame.append is deprecated in recent pandas versions.
        dfclusternew = dfclusternew.append(
            pd.Series([cluster, dfcluster['x'].iloc[j], dfcluster['y'].iloc[j]],
                      index=['cluster', 'x', 'y']),
            ignore_index=True)
    cluster += 1
dfcluster = dfclusternew
print(dfcluster)
However, I think there is another way to do it that is faster. Since the result is a list of row indices for every cluster, I used dfcluster.loc[dfcluster.index[results[i]], 'cluster'] = i:
import pandas as pd
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm
import random
coords = [(random.random()*2.0, random.random()*2.0) for _ in range(100)]
dfcluster = pd.DataFrame(coords, columns = ['x','y'])
dfcluster['cluster'] = 0
# Cluster on the coordinates only; the placeholder 'cluster' column is excluded.
sample = dfcluster[['x', 'y']].to_numpy()
# initialize
initial_centers = kmeans_plusplus_initializer(sample, 5, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
# create instance of Fuzzy C-Means algorithm
fcm_instance = fcm(sample, initial_centers)
# run cluster analysis and obtain results
fcm_instance.process()
dfcluster = dfcluster.reset_index(drop=True)
results = fcm_instance.get_clusters()
for i in range(len(results)):
    dfcluster.loc[dfcluster.index[results[i]], 'cluster'] = i
print(dfcluster)
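As an aside, since get_clusters() returns one list of row indices per cluster, the labels can also be built in a single pass with numpy. A minimal sketch reusing the dfcluster and results names from above:
import numpy as np

# Build a flat label array: labels[row] = id of the cluster containing that row.
labels = np.zeros(len(dfcluster), dtype=int)
for cluster_id, member_indices in enumerate(results):
    labels[member_indices] = cluster_id
dfcluster['cluster'] = labels
print(dfcluster)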
I want to count the points above the least squares fits.
from sklearn.mixture import GaussianMixture
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from astropy.io import ascii
from scipy.stats import norm
from astropy.timeseries import LombScargle
from astropy import stats  # note: this shadows scipy.stats imported above
data4= pd.read_csv('Standard Dev main pop.csv')
names4 = data4.columns
df3 = pd.DataFrame(data4, columns=names4)
df3.head()
#print(c)
data5= pd.read_csv('2 Sigma Main pop.csv')
names5 = data5.columns
df5 = pd.DataFrame(data5, columns=names5)
df5.head()
data6= pd.read_csv('3 Sigma main pop.csv')
names6 = data6.columns
df6 = pd.DataFrame(data6, columns=names6)
df6.head()
a=df5['Mean Mag']
b=df5['Std']
c=df6['Mean Mag']
d=df6['Std']
e=df3['Mean Mag']
f=df3['Std']
ax=plt.scatter(e,f, label=' All sources')
#ay=plt.scatter(c,d, label='3 Sigma from Median Std')
lstsq_coefs = np.polyfit(a, b, deg=2)
lstsq_preds = lstsq_coefs[0]*a**2 + lstsq_coefs[1]*a + lstsq_coefs[2]
plt.plot(a, lstsq_preds, linestyle="dashed", color="red", label="Least squares 2 sigma")
#ay=plt.scatter(c,d, label='3 Sigma from Median Std')
lstsq_coefs1 = np.polyfit(c ,d, deg=2)
lstsq_preds1 = lstsq_coefs1[0]*c**2 + lstsq_coefs1[1]*c + lstsq_coefs1[2]
plt.plot(c, lstsq_preds1, linestyle="dashed", color="black", label="Least squares 3 sigma")
plt.legend(loc='best',fontsize= 16)
plt.gcf().set_size_inches((12,10))
plt.ylim(0,0.1)
plt.show()
I want to count the number of points that lie above each least-squares fit. I have tried some extremely tedious methods, which are not feasible in the long run.
You can compare them using the numpy module:
import numpy as np
f = np.array(f)
lstsq_preds = np.array(lstsq_preds)
lstsq_preds1 = np.array(lstsq_preds1)
print("Number above least squares #1:", len(f[f > lstsq_preds]))
print("Number above least squares #2:", len(f[f > lstsq_preds]))
Note that I just convert the arrays to numpy to make sure they are numpy arrays; these conversion lines may be unnecessary since you are already working with pandas objects.
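If the goal is to compare each point of the full sample (e, f) against the fitted curves, one possibly cleaner variant is to evaluate each polynomial at the x-values of those same points with np.polyval, so the arrays being compared have matching lengths. A sketch reusing the names from the question:
import numpy as np

# Evaluate each fitted polynomial at the x-values of the points being tested.
preds_2sigma = np.polyval(lstsq_coefs, e)   # fit from the 2-sigma data
preds_3sigma = np.polyval(lstsq_coefs1, e)  # fit from the 3-sigma data

# Count how many (e, f) points lie above each curve.
print("Points above the 2-sigma fit:", int(np.sum(f > preds_2sigma)))
print("Points above the 3-sigma fit:", int(np.sum(f > preds_3sigma)))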
I'm dealing with a multicollinearity problem using the variance_inflation_factor() function. But after running the function, I found that it returned all the scores as infinite values.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')
mols = []
mols.extend(mols_A1)
mols.extend(mols_A2)
mols.extend(mols_B)
mols.extend(mols_C)
mols_df = pd.DataFrame(mols)
mols = pd.concat([mols_df, class_tot, data], axis=1)
mols = mols.dropna(axis=0, thresh=1400)
mols.groupby('target_name_quarter').mean()
fill_mean_func = lambda g: g.fillna(g.mean())
mols = mols.groupby('target_name_quarter').apply(fill_mean_func)
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
descriptors_scaled = pd.DataFrame(descriptors_scaled, columns=descriptors.columns, index = list(descriptors.index.values))
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)
mols = pd.concat([molfiles, descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']], axis=1)
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])
vif = pd.DataFrame()
des = mols.loc[:, 'nAcid':'Zagreb']
vif["VIF factor"] = [variance_inflation_factor(des.values, i) for i in range(des.shape[1])]
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() when eliminating low-variance features so as to put all the variables in the same range.
print(vif) returns a dataframe with all infinite values and I cannot figure out why.
Thank you in advance :)
This shows a perfect correlation between two independent variables. In the case of perfect correlation, R² = 1, which makes VIF = 1/(1 - R²) infinite. To solve this problem, drop one of the variables causing the perfect multicollinearity from the dataset.
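A minimal sketch of one way to act on that, assuming the offending pairs can be spotted from the correlation matrix of the des frame used in the question (the 0.999 threshold is only an illustration):
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Keep the upper triangle of the absolute correlation matrix so each pair is
# inspected once, flag one column of every (near-)perfectly correlated pair,
# drop it, and recompute the VIFs.
corr = des.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.999).any()]

des_reduced = des.drop(columns=to_drop)
vif_reduced = pd.DataFrame({
    "features": des_reduced.columns,
    "VIF factor": [variance_inflation_factor(des_reduced.values, i)
                   for i in range(des_reduced.shape[1])],
})
print(vif_reduced)
Note that an exact linear combination involving more than two columns can also produce infinite VIFs without any single pairwise correlation reaching 1, so some manual inspection may still be needed.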
I have written code that gives me the best number of clusters based on the maximum value of silhouette_score. Now I want to find out how many values each cluster has. For example, if the optimal number of clusters is 3, I want to know how many values each cluster has, e.g. the first cluster has 1241 values, the second 3134 and the third 351.
Is it possible to do something like that?
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.decomposition import PCA
df = pd.read_csv('CNN Comments.csv')
df = df.head(8000)
#print(df)
x = df['Text Data']
cv = TfidfVectorizer(analyzer = 'word',max_features = 10000, preprocessor=None, lowercase=True, tokenizer=None, stop_words = 'english')
#cv = CountVectorizer(analyzer = 'word', max_features = 8000, preprocessor=None, lowercase=True, tokenizer=None, stop_words = 'english')
x = cv.fit_transform(x)
my_list = []
list_of_clusters = []
for i in range(2, 5):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x)
    my_list.append(kmeans.inertia_)
    cluster_labels = kmeans.fit_predict(x)
    silhouette_avg = silhouette_score(x, cluster_labels) * 100
    print(round(silhouette_avg, 2))
    list_of_clusters.append(round(silhouette_avg, 1))
plt.plot(range(2,5),my_list)
plt.show()
number_of_clusters = max(list_of_clusters)
number_of_clusters = list_of_clusters.index(number_of_clusters)+2
print('Number of clusters: ', number_of_clusters)
You can use the array assigned to cluster_labels to get the distribution of cluster assignments. I would recommend using Counter from the collections module.
from collections import Counter
...
cluster_labels = kmeans.fit_predict(x)
cluster_counts = Counter(cluster_labels)
The alternative with numpy:
import numpy as np
...
unique, counts = np.unique(kmeans.fit_predict(x), return_counts=True)
print(dict(zip(unique, counts)))
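If the counts are wanted for the finally chosen number_of_clusters rather than for the last model in the loop, a small sketch (reusing x and number_of_clusters from the question) could refit once and tally the labels:
import pandas as pd
from sklearn.cluster import KMeans

# Refit with the chosen number of clusters and report the size of each cluster.
best_kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=42)
best_labels = best_kmeans.fit_predict(x)
print(pd.Series(best_labels).value_counts().sort_index())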
I'm trying to run a t-SNE, but Python shows me this error:
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
The data is provided at this link.
Here's the code:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clean the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no discriminative information
dataframe_all = dataframe_all.iloc[:, 7:]
#Step 3: Create feature vectors
x = dataframe_all.iloc[:, :-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
#scatter plot the sample points among 5 classes
markers=('s','d','o','^','v')
color_map = {0:'red', 1:'blue', 2:'lightgreen', 3:'purple', 4:'cyan'}
plt.figure()
for idx, cl in enumerate(np.unique(x_test_2d)):
    plt.scatter(x=x_test_2d[cl, 0], y=x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx], label=cl)
plt.show()
What do I have to change in order to make this work?
The error is due to the following line:
plt.scatter(x_test_2d[cl, 0], x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx])
Here, cl takes non-integer values (it iterates over np.unique(x_test_2d)), and this is what raises the error. For example, the last value that cl takes is 99.46295, so x_test_2d[cl, 0] becomes x_test_2d[99.46295, 0], which is not a valid index.
Define a variable y that holds the class labels, then use:
# variable holding the classes
y = dataframe_all.classe.values
y = np.array([ord(i) for i in y])
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()
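As an aside (not part of the original fix), pandas can map the class letters to integer codes directly, which avoids the ord() conversion. A minimal sketch using the same dataframe_all and x_test_2d:
# pd.factorize returns integer codes plus the unique class labels.
codes, classes = pd.factorize(dataframe_all['classe'])
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c=codes)
plt.show()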
FULL CODE:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clean the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no discriminative information
dataframe_all = dataframe_all.iloc[:, 7:]
#Step 3: Create feature vectors
x = dataframe_all.iloc[:, :-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
# variable holding the classes
y = dataframe_all.classe.values # you need this for the colors
y = np.array([ord(i) for i in y]) # convert letters to numbers
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()