I am using DBSCAN to cluster my data so I can label the records that are anomalous. I want to write a 1 next to the outlier records in my CSV file, but for now my code only reports the record numbers and prints the rows that are outliers. Here is my code:
# data wrangling
import pandas as pd
# visualization
import matplotlib.pyplot as plt
# algorithm
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
# import data
df = pd.read_csv("C:/Users/user1/Desktop/4005_20200101_20200331.csv")
print(df.head())
# setting up the data to cluster
X = df
# scale and standardize the data
X = StandardScaler().fit_transform(X)
# Instantiating our DBSCAN model. In the code below, eps = 3 and min_samples = 4,
# the minimum number of points required in a neighbourhood for a point to be a core point.
# instantiating DBSCAN
dbscan = DBSCAN(eps=3, min_samples=4)
# fitting the model
model = dbscan.fit(X)
# Storing the labels assigned by DBSCAN
labels = model.labels_
# Identifying which points make up our "core points"
from sklearn import metrics
# identify core samples
core_samples = np.zeros_like(labels, dtype=bool)
core_samples[dbscan.core_sample_indices_] = True
print(core_samples)
#Calculating the number of clusters
# count the clusters, excluding the noise label (-1)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(n_clusters)
#Computing the Silhouette Score
#print("Silhoette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)
outliers = df[model.labels_ == -1]
print(outliers)
I want to write a 1 next to the outlier records in my CSV file.
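A minimal sketch of one way to do this, assuming the goal is to add a flag column (1 for outliers, 0 otherwise) and write the result to a new CSV (the output filename here is just an example):
# DBSCAN marks noise points with the label -1
df['outlier'] = (model.labels_ == -1).astype(int)
# write the flagged records to a new CSV file (example path)
df.to_csv("C:/Users/user1/Desktop/4005_labeled.csv", index=False)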
I found some code on SO which seems to work quite well.
The code directly below produces the plot shown after it.
from sklearn import datasets
from sklearn import cluster
import plotly
plotly.offline.init_notebook_mode()
iris = datasets.load_iris()
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(iris.data[:,0:1])
data = [plotly.graph_objs.Scatter(x=iris.data[:,0],
y=iris.data[:,1],
mode='markers',
marker=dict(color=kmeans.labels_)
)]
plotly.offline.iplot(data)
Now, I make a simple substitution in the code, to point to my own data, like this.
from sklearn import datasets
from sklearn import cluster
import plotly
plotly.offline.init_notebook_mode()
x = df[['Spend']]
y = df[['Revenue']]
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(x,y)
data = [plotly.graph_objs.Scatter(x=df[['Spend']],
y=df[['Revenue']],
mode='markers',
marker=dict(color=kmeans.labels_))]
plotly.offline.iplot(data)
That gives me this plot.
Here is my data frame.
# Import pandas library
import pandas as pd
# initialize list of lists
data = [[110,'CHASE CENTER',53901,8904,44997,4], [541,'METS STADIUM',57999,4921,53078,1], [538,'DEN BRONCOS',91015,9945,81070,1], [640,'LAMBEAU WI',76214,5773,70441,3], [619,'SAL AIRPORT',93000,8278,84722,5]]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Location', 'Location_Description', 'Revenue','Spend','Profit_Or_Loss','cluster_number'])
# print dataframe.
df
I must be missing something silly, but I don't see what it is.
You have a problem with the dimensions:
# In the iris dataset
>>> iris.data[:,0].shape
(150,)
# Your data
>>> x.shape
(5, 1)
# You need to flatten your array
x.values.flatten().shape
(5,)
For example:
from sklearn import datasets
from sklearn import cluster
import plotly
plotly.offline.init_notebook_mode()
x = df[['Spend']]
y = df[['Revenue']]
x_flat = x.values.flatten()
y_flat = y.values.flatten()
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(x)
data = [plotly.graph_objs.Scatter(x=x_flat,
y=y_flat,
mode='markers',
marker=dict(color=kmeans.labels_))]
plotly.offline.iplot(data)
On the other hand, cluster.KMeans.fit accepts a single array (not two as you are passing). You're going to have to convert them into something of shape (n_samples, n_features):
X = np.zeros((x_flat.shape[0], 2))
X[:, 0] = x_flat
X[:, 1] = y_flat
# X.shape -> (5, 2)
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(X)
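For completeness, a short equivalent sketch, continuing from the variables defined above, that stacks the two flattened columns with np.column_stack and fits on the combined two-feature array before plotting:
import numpy as np

# combine the two features into one (n_samples, 2) array
X = np.column_stack((x_flat, y_flat))
kmeans = cluster.KMeans(n_clusters=5, random_state=42).fit(X)

data = [plotly.graph_objs.Scatter(x=x_flat,
                                  y=y_flat,
                                  mode='markers',
                                  marker=dict(color=kmeans.labels_))]
plotly.offline.iplot(data)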
I'm getting the following error from my code:
ValueError: Expected 2D array, got scalar array instead:
array=99.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Here is the code used:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
Physical_activity_df = pd.read_excel('C:/Users/Usuario/Desktop/LW_docs/Physical_activity_nopass.xlsx')
prediction_df = Physical_activity_df[['Activity_Score','Calories']]
prediction_df.plot(kind='scatter', x= 'Activity_Score', y= 'Calories')
plt.show()
#change to df variables
activity_score = pd.DataFrame(prediction_df['Activity_Score'])
calories = pd.DataFrame(prediction_df['Calories'])
lm = linear_model.LinearRegression()
model = lm.fit(activity_score,calories)
#predict new values for calories (FROM HERE COMES THE ERROR)
activity_score_new = 99
calories_predict = model.predict(activity_score_new)
calories_predict
Any idea how to fix this issue? Thanks!
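As the error message itself suggests, predict expects a 2D array of shape (n_samples, n_features). A minimal sketch of the fix, assuming the single new Activity_Score value of 99:
import numpy as np

# one sample, one feature -> shape (1, 1)
activity_score_new = np.array([[99]])
calories_predict = model.predict(activity_score_new)
print(calories_predict)

# alternatively, keep the feature name by passing a one-row DataFrame:
# calories_predict = model.predict(pd.DataFrame({'Activity_Score': [99]}))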
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
# Read the data.
data = np.asarray(pd.read_csv('data.csv', header=None))
# Assign the features to the variable X, and the labels to the variable y.
X = data[:,0:2]
y = data[:,2]
# TODO: Create the model and assign it to the variable model.
# Find the right parameters for this model to achieve 100% accuracy on the dataset.
model = SVC()
model.fit(X,y)
2 Questions:
1. The data goes from a pandas DataFrame (created by pd.read_csv) into a NumPy array. Is that better? Is there a good reason for it? Why not stay with the DataFrame?
2. I do not understand this notation:
X = data[:,0:2]
y = data[:,2]
What does it do? (A short sketch after the data description below illustrates it.)
Thank you.
The data consists of a CSV file with many rows like this:
0.28917,0.65643,0
It has three columns: the first two contain the coordinates of the points, and the third the label.
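A short sketch of what the slicing does, using two made-up rows with the same three-column layout; it also shows that scikit-learn works with DataFrame columns directly (via iloc), so converting to a NumPy array is a convenience rather than a requirement:
import numpy as np
import pandas as pd

data = np.array([[0.28917, 0.65643, 0],
                 [0.40000, 0.10000, 1]])  # illustrative values only

X = data[:, 0:2]  # every row, columns 0 and 1 -> the two coordinates
y = data[:, 2]    # every row, column 2        -> the labels

# the same selection straight from a DataFrame, without converting first
df = pd.DataFrame(data)
X_df = df.iloc[:, 0:2]
y_df = df.iloc[:, 2]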
I am trying to run K-Means in Python on aggregated data files. For example, instead of a data frame in which 3 identical records appear as 3 separate rows, a single row represents all 3, with a column such as cnt (arbitrarily named) holding the number 3 to indicate how many instances it stands for.
Below is some basic starter code that does NOT use the aggregated representation of the rows. Please let me know if you would like me to post the .csv too, but it should be pretty basic:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
data = pd.read_csv('../Data/wholesale_data.csv')
data.head()
categorical_features = ['Channel', 'Region']
continuous_features = ['Fresh', 'Milk', 'Grocery', 'Frozen',
'Detergents_Paper', 'Delicassen']
for col in categorical_features:                     # for each categorical column
    dummies = pd.get_dummies(data[col], prefix=col)  # one-hot encoding
    data = pd.concat([data, dummies], axis=1)        # append to data
    data.drop(col, axis=1, inplace=True)             # drop original column
data.head()
mms = MinMaxScaler()
mms.fit(data)
data_transformed = mms.transform(data)
sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)                     # init model
    km = km.fit(data_transformed)                 # fit model
    sum_of_squared_distances.append(km.inertia_)  # overall SSE
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
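One possible way to work with the aggregated representation, assuming a cnt column that records how many raw rows each aggregated row stands for, is to pass those counts to KMeans as sample weights (KMeans.fit accepts a sample_weight argument in scikit-learn 0.20+), so each row counts cnt times toward the cluster centres. A sketch under those assumptions; the cnt column and the feature handling here are hypothetical, adapted from the starter code above:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# assume `data` is the encoded frame from above, plus a hypothetical `cnt` column
weights = data['cnt'].values              # multiplicity of each aggregated row
features = data.drop(columns=['cnt'])     # scale only the feature columns
features_transformed = MinMaxScaler().fit_transform(features)

sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km.fit(features_transformed, sample_weight=weights)  # each row weighted by its count
    sum_of_squared_distances.append(km.inertia_)

plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k (weighted by cnt)')
plt.show()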