import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
ridge_clf = RidgeCV(alphas=alphas, scoring='r2').fit(X, y)
ridge_clf.score(X, y)
I wanted to plot the following graph for RidgeCV, but I don't see any option to do that the way GridSearchCV offers one. I appreciate your suggestions!
There is no indication of what the colors stand for. I assume they stand for the features, and that we are investigating the size of each feature's weight as a function of alpha. Here is my solution:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
w = []  # one coefficient vector per alpha
for a in alphas:
    ridge_clf = RidgeCV(alphas=[a], cv=10).fit(X, y)
    w.append(ridge_clf.coef_)
w = np.array(w)
plt.semilogx(alphas, w)
plt.title('Ridge coefficients as function of the regularization')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.legend(X.keys())
Output:
Since you only have two features in X (total_bill and size), there are only two lines.
Here is the code for generating the plot that you posted.
First, we need to understand that RidgeCV will not return the coefficients for each alpha value fed in through the alphas param.
The motivation behind RidgeCV is that it tries each alpha value listed in alphas and then, based on the cross-validation score, returns the best alpha along with the model fitted for it.
Hence, the only way to get the coefficients for each alpha value while still using CV is to iterate, fitting a separate RidgeCV for each alpha.
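For instance, the RidgeCV fitted in the question exposes only the single winning alpha and one coefficient vector (a quick check, reusing ridge_clf from the first snippet):
print(ridge_clf.alpha_)  # the one alpha chosen by cross-validation
print(ridge_clf.coef_)   # coefficients for that alpha only, not one set per alpha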
Example:
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# License: BSD 3 clause
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# X is the 10x10 Hilbert matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
coefs = []
for a in alphas:
    ridge = linear_model.RidgeCV(alphas=[a], fit_intercept=False, cv=3)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
# #############################################################################
# Display results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('RidgeCV coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
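A side note: wrapping a single alpha in RidgeCV runs cross-validation without any real model selection, so if you only want the coefficient path, plain Ridge is cheaper. A minimal sketch, reusing alphas, X, and y from above:
coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    coefs.append(ridge.fit(X, y).coef_)
# then plot exactly as above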
Related
I'm trying to plot two probability density functions on one figure (so that they overlap).
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
data = [0,0,1,2,2,2,2,1,2,3,0,5,4,5,4,6,2,2,5,4,6,3,2,5,4,3,7,-1,0]
scaler = MinMaxScaler()
df = pd.DataFrame(data, columns=['Numbers'])
X = np.asarray(data)
X = X.reshape(-1, 1)
standardized_data = scaler.fit_transform(X)
normal_data = np.random.normal(loc=0.0, scale=1.0, size=len(df))
sns.displot(normal_data, kind='kde')
sns.displot(standardized_data, kind='kde')
plt.show()
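Each call to sns.displot opens a new figure, which is why the two densities never overlap. One way to get them onto the same axes (assuming seaborn 0.11+, where displot accepts a dict of named datasets and adds a legend from the keys):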
sns.displot(
    {"normal": normal_data, "standardized": standardized_data.squeeze()},
    kind='kde'
)
I am trying to utilize k means clustering in Python and I am encountering an error. The dataset that I am working with can be found here:
https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
from sklearn.preprocessing import scale, StandardScaler
import sklearn
from sklearn.cluster import KMeans
df = pd.read_csv('vehicles.csv')
df2 = df[['comb08','youSaveSpend']].copy()
scaler = StandardScaler()
scaler.fit(df2)
scaled_array = scaler.transform(df2)
average = np.mean(scaled_array[:,0])
std = np.std(scaled_array[:,0])
df2 = scaled_array
max_clusters = 10
noClusters = range(1, max_clusters + 1)
kmeans = [KMeans(n_clusters = i) for i in noClusters]
score = [kmeans[i].fit(df2).score(df2) for i in range(len(kmeans))]
plt.plot(noClusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel("Score")
plt.title('Elbow Curve')
kmeans = KMeans(n_clusters = 10, random_state = 0)
kmeans = kmeans.fit(scaled_array)
unscaled = scaler.inverse_transform(kmeans.cluster_centers_)
unscaled
centroids = pd.DataFrame({'centroidx':unscaled[:,0],'centroidy':unscaled[:,1]})
df2['label'] = kmeans.labels_.astype(np.int)
df2.head() # <======== Error Occurs Here
plt.scatter(df2['comb08'], df2['youSaveSpend'], c=df2.label) # (x,y,color)
plt.scatter(centroids['centroidx'], \
centroids['centroidy'], c='red') # (x,y,color)
plt.show()  # <======== Error also Occurs Here
The error I get is below:
When I try to convert my floats like so:
df2 = df.apply(pd.to_numeric)
I get this error:
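Without the error text it is hard to be sure, but a likely culprit (an assumption, not confirmed by the post) is that df2 = scaled_array rebinds df2 to a plain NumPy array, which supports neither column assignment by name nor .head(). A sketch of a fix:
# Assumption: the failure comes from df2 being a NumPy array, not a DataFrame.
# Rebuilding it as a DataFrame restores .head() and column assignment:
df2 = pd.DataFrame(scaled_array, columns=['comb08', 'youSaveSpend'])
df2['label'] = kmeans.labels_.astype(int)  # plain int; np.int is deprecated
df2.head()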
I have written a simple K-means algorithm, but I am having difficulty exploring it cluster by cluster.
Github Link: https://github.com/AkshayBayas/Machine-learning-/blob/master/K-Means%20algorithm.ipynb
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%pylab
Df = pd.read_csv('Kdata.csv')
from sklearn.cluster import KMeans
KModule = KMeans()
K_model = KModule.fit(Df)
K_result = K_model.predict(Df)
centers = K_model.cluster_centers_
K_model.labels_
plt.scatter(x1, x2, c=K_model.labels_, cmap='rainbow')
Can anyone help?
No idea what you mean by "explore cluster by cluster".
If you don't specify the number of clusters, it defaults to 8, so if you start with 3, as in the code below, you can separate them. You also need to make the cluster labels categorical, so they are not colored on a continuous scale:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
Df = pd.read_csv('Kdata.csv')
from sklearn.cluster import KMeans
KModule = KMeans(n_clusters=3)
K_model = KModule.fit(Df)
K_result = K_model.predict(Df)
Df['cluster'] = pd.Categorical(K_model.labels_)
sns.scatterplot(x="V1", y="V2", data=Df, hue='cluster', palette='rainbow')
Df.plot.scatter(x="V1", y="V2", c='cluster', cmap='rainbow')
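To actually look at one cluster at a time, you can filter on the new column (a small sketch, assuming the V1/V2 columns used above):
for k in range(3):
    cluster_k = Df[Df['cluster'] == k]
    print(f'cluster {k}: {len(cluster_k)} points')
    print(cluster_k[['V1', 'V2']].describe())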
I am working on fuzzy c-means clustering of the iris dataset, however I cannot visualize it due to some errors. Using this tutorial I wrote the following for iris, however it raises "AttributeError: shape". This is my code:
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as sm
import skfuzzy as fuzz
iris = datasets.load_iris()
x = pd.DataFrame(iris.data, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])
plt.figure(figsize=(6, 3))
model = fuzz.cluster.cmeans(iris, 3, 2, error=0.005, maxiter=1000, init=None, seed=None)
model.fit(x)
plt.show()
I assumed that passing the parameters into the variable model would be enough, however it shows the above error. If possible, could you show where I made a mistake? How do I fix this? I really appreciate your help!
I tried pre-processing the data first, and I got a good plot. I simply followed the tutorials: I performed SVD to reduce the dimensions to two, then started to plot; it seems that for the tutorials you only need two dimensions (x, y). You do not need to call model.fit(), I have not found that kind of command in the documentation. Here is the code:
import numpy as np, pandas as pd, os
import matplotlib
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import skfuzzy as fuzz
from sklearn import datasets
################################################################################
iris = datasets.load_iris()
x = pd.DataFrame(iris.data, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])
scaler = StandardScaler()
X_std = scaler.fit_transform(x)
lsa = TruncatedSVD(2, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(X_std)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
a= pd.DataFrame(dtm_lsa, columns = ["component_1","component_2"])
a['targets']=y
fig1, axes1 = plt.subplots(3, 3, figsize=(8, 8))
alldata = np.vstack((a['component_1'], a['component_2']))
fpcs = []
colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']
for ncenters, ax in enumerate(axes1.reshape(-1), 2):
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        alldata, ncenters, 2, error=0.005, maxiter=1000, init=None)
    # Store fpc values for later plots
    fpcs.append(fpc)
    # Plot assigned clusters, for each data point in training set
    cluster_membership = np.argmax(u, axis=0)
    for j in range(ncenters):
        ax.plot(a['component_1'][cluster_membership == j],
                a['component_2'][cluster_membership == j], '.', color=colors[j])
    # Mark the center of each fuzzy cluster
    for pt in cntr:
        ax.plot(pt[0], pt[1], 'rs')
    ax.set_title('Centers = {0}; FPC = {1:.2f}'.format(ncenters, fpc))
    ax.axis('off')
fig1.tight_layout()
fig1.savefig('iris_dataset.png')
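The fpcs list collected above is worth plotting too; the fuzzy partition coefficient gives a rough way to pick the number of centers (a sketch following the same tutorial's pattern):
fig2, ax2 = plt.subplots()
ax2.plot(np.arange(2, 11), fpcs)  # ncenters ran from 2 through 10
ax2.set_xlabel('Number of centers')
ax2.set_ylabel('Fuzzy partition coefficient')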
I generated some data y with a linear relationship to log(x). I put y and x in a dataframe, sorted by the value of x, fitted the model, and then tried to plot the data points along with the fitted line. However, what I got was a very messy fitted line. I must have done something wrong. This can be done easily in R, but with statsmodels I still cannot figure out why. Help needed. Thanks in advance.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
%matplotlib inline
B0 = 3
B1 = 0.5
X = np.random.rand(1000)
epsilon = np.random.normal(0,0.1, size=1000)
y = B0 + B1*np.log(X) + epsilon
df1 = pd.DataFrame({'Y':y, 'X':X})
df1.sort_values('X', inplace=True)
model1 = smf.ols('Y~np.log(X)', data=df1).fit()
plt.scatter(df1.X, df1.Y)
plt.plot(df1.X, model1.predict(np.log(df1.X)), 'r-')
This is what I got:
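A likely cause, though the plot itself is missing here (an assumption on my part): the formula 'Y~np.log(X)' already applies the log, so model1.predict(np.log(df1.X)) transforms X a second time. Predicting from the raw, sorted dataframe should give a clean line:
plt.scatter(df1.X, df1.Y)
plt.plot(df1.X, model1.predict(df1), 'r-')  # the formula applies np.log itself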