plt.plot draws multiple curves instead of a single curve - python
Here is the link to the dataset I used: Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Let's begin with polynomial regression
df = pd.read_excel('enes.xlsx')
X = pd.DataFrame(df['hacim'])
Y = pd.DataFrame(df['delay'])
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, Y)
plt.scatter(X, Y, color = 'red')
plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)), color = 'blue')
plt.title('X Vs Y')
plt.xlabel('hacim')
plt.ylabel('delay')
plt.show()
The last plt.show() displays a graph with many lines instead of the single polynomial regression curve I wanted. What's wrong, and how can I fix it?
Data
,hacim,delay
0,815,1.44
1,750,1.11
2,321,2.37
3,1021,1.44
4,255,1.09
5,564,1.61
6,1455,15.27
7,525,2.7
8,1118,106.98
9,1036,3.47
10,396,1.34
11,1485,21.49
12,1017,12.22
13,1345,2.72
14,312,1.71
15,742,33.79
16,1100,39.62
17,1445,4.88
18,847,1.55
19,991,1.82
20,1296,10.77
21,854,1.81
22,1198,61.9
23,1162,8.22
24,1463,42.25
25,1272,4.31
26,745,2.36
27,521,2.14
28,1247,94.33
29,732,12.55
30,489,1.05
31,1494,12.78
32,591,3.18
33,257,1.18
34,602,4.24
35,335,2.06
36,523,3.63
37,752,7.61
38,349,1.76
39,771,0.79
40,855,39.08
41,948,3.95
42,1378,97.28
43,598,2.69
44,558,1.67
45,634,34.69
46,1146,12.22
47,1087,1.74
48,628,1.03
49,711,3.34
50,1116,7.27
51,748,1.09
52,1212,14.16
53,434,1.42
54,1046,8.25
55,568,1.33
56,894,2.61
57,1041,4.79
58,801,1.84
59,1387,11.5
60,1171,161.21
61,734,2.43
62,1471,17.42
63,461,1.42
64,751,2.36
65,898,2.4
66,593,1.74
67,942,3.39
68,825,1.09
69,715,20.23
70,725,5.43
71,1128,7.57
72,1348,4.49
73,1393,9.77
74,1379,97.76
75,859,2.59
76,612,15.98
77,1495,8.22
78,887,1.85
79,867,38.65
80,1353,1.6
81,851,60.25
82,1079,24.05
83,1100,25.58
84,638,1.23
85,1115,1.94
86,1443,4.79
87,1421,10.33
88,1279,7.29
89,1176,173.44
90,315,1.53
91,1019,34.03
92,1337,48.67
93,576,28.83
94,919,2.88
95,361,1.5
96,989,1.47
97,1286,32.11
Let's use pandas' plot instead; it is much easier:
X=pd.DataFrame(df['hacim'])
Y=pd.DataFrame(df['delay'])
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, Y)
df['y_pred'] = lin_reg_2.predict(poly_reg.fit_transform(X))
# Sort by the predictor so the fitted curve is drawn left to right
df = df.sort_values('hacim')
ax = df.plot.scatter('hacim', 'delay')
df.plot('hacim', 'y_pred', ax=ax, color='r')
plt.title('X Vs Y')
plt.xlabel('hacim')
plt.ylabel('delay')
plt.show()
Output:
The root cause of the stray lines was unsorted data: plt.plot connects points in the order they are given, so an unsorted predictor makes the line double back across the plot.
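If you prefer plain matplotlib, here is a minimal sketch of the same fix, reusing the variables from the question and sorting the predictor with numpy before plotting:

order = np.argsort(X.values.ravel())                # indices that sort hacim ascending
x_sorted = X.values.ravel()[order].reshape(-1, 1)   # back to a 2-D column for sklearn
plt.scatter(X, Y, color='red')
plt.plot(x_sorted, lin_reg_2.predict(poly_reg.fit_transform(x_sorted)), color='blue')
plt.show()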
Alternatively, you could keep the unsorted data and simply suppress the connecting line segments by drawing markers only:
plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)), color='blue', marker='o', linestyle='none')
Output:
Related
Trouble creating scatter plot
I'm having trouble using scatter to create a scatter plot. Can someone help me? I've highlighted the line causing the error:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('vetl8.csv')
df = pd.DataFrame(data=data)
clusterNum = 3
X = df.iloc[:, 1:].values
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)
k_means.fit(X)
labels = k_means.labels_
df["Labels"] = labels
df.to_csv('dfkmeans.csv')
plt.scatter(df[2], df[1], c=labels)  # Here
plt.xlabel('K', fontsize=18)
plt.ylabel('g', fontsize=16)
plt.show()
# data set correct
You are close; just a minor adjustment to access the x and y columns by number should fix it:

plt.scatter(df[df.columns[2]], df[df.columns[1]], c=df["Labels"])
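For context on why df[2] fails: pandas treats the value inside df[...] as a column label, not a position. A small illustration with a hypothetical three-column frame:

import pandas as pd
df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
# df[2]             # KeyError: no column is labeled 2
df[df.columns[2]]   # look up the label at position 2, i.e. column 'c'
df.iloc[:, 2]       # or index by position directly with iloc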
How to plot SciKit-Learn linear regression graph
I am new to SciKit-Learn and I have been working on a regression problem (king county csv) on kaggle. I have been training a regression model to predict the price of a house, and I wanted to plot the graph, but I have no idea how to do so. I am using Python 3.6. Any advice or suggestion would be greatly appreciated.

# importing numpy, pandas, and seaborn
import numpy as np    # linear algebra
import pandas as pd   # data preprocessing, CSV file I/O
import seaborn as sns # for plotting graphs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

data = pd.read_csv('kc_house_data.csv')
data = data.drop('date', axis=1)
data = data.drop('id', axis=1)

X = data
Y = X['price'].values
X = X.drop('price', axis=1).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=21)

reg = LinearRegression()
kfold = KFold(n_splits=15, random_state=21)
cv_results = cross_val_score(reg, X_train, Y_train, cv=kfold, scoring='r2')
print(cv_results)
round(np.mean(cv_results)*100, 2)
This is the code from sklearn: https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
You can use matplotlib for plotting:

import matplotlib.pyplot as plt
plt.figure(figsize=(16, 9))
plt.plot(cv_results)
plt.show()

There are multiple plot types you can use, such as a simple line plot or a scatter plot:

plt.barh(x, y)     # horizontal bar chart
plt.plot(x, y)     # line graph
plt.scatter(x, y)  # scatter plot
Seaborn is a very useful visualization library. So much so that you can use 'seaborn.regplot' to directly plot the data and regression-model-fit line. It directly takes in the predictor variable and response variable, and spits out the plot of data points and best fit line. Here is the link on how to use it: https://seaborn.pydata.org/generated/seaborn.regplot.html
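As a rough sketch of how that could look for this dataset (assuming the usual kc_house_data.csv column names such as sqft_living and price):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv('kc_house_data.csv')
# regplot draws the scatter and the fitted regression line in one call
sns.regplot(x='sqft_living', y='price', data=data)
plt.show()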
I have also done the same competition on kaggle. For regressions I would go for a scatter plot:

import matplotlib.pyplot as plt
plt.scatter(x, y)

As for the visualisations on that particular competition, I would use the following code:

# visualising some more outliers in the data values
fig, axs = plt.subplots(ncols=2, nrows=0, figsize=(12, 120))
plt.subplots_adjust(right=2)
plt.subplots_adjust(top=2)
sns.color_palette("husl", 8)
for i, feature in enumerate(list(train[numeric]), 1):
    if feature == 'MiscVal':
        break
    plt.subplot(len(list(numeric)), 3, i)
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=train)
    plt.xlabel('{}'.format(feature), size=15, labelpad=12.5)
    plt.ylabel('SalePrice', size=15, labelpad=12.5)
    for j in range(2):
        plt.tick_params(axis='x', labelsize=12)
        plt.tick_params(axis='y', labelsize=12)
    plt.legend(loc='best', prop={'size': 10})
plt.show()

I have actually uploaded the full code for that competition on my GitHub if you want to have a look ;) (I am currently in the top 14% on that competition).
Plot RidgeCV coefficients as a function of the regularization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV

tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip', 'sex', 'smoker', 'day', 'time'])
y = tips['tip']

alphas = 10**np.linspace(10, -2, 100)*0.5
ridge_clf = RidgeCV(alphas=alphas, scoring='r2').fit(X, y)
ridge_clf.score(X, y)

I wanted to plot the following graph for RidgeCV. I don't see any option to do that like GridSearchCV. I appreciate your suggestions!
There is no indication what the colors stand for; I assume they stand for the features, and we investigate the size of each feature weight as a function of alpha. Here is my solution:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV

tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip', 'sex', 'smoker', 'day', 'time'])
y = tips['tip']

alphas = 10**np.linspace(10, -2, 100)*0.5
w = list()
for a in alphas:
    ridge_clf = RidgeCV(alphas=[a], cv=10).fit(X, y)
    w.append(ridge_clf.coef_)
w = np.array(w)

plt.semilogx(alphas, w)
plt.title('Ridge coefficients as a function of the regularization')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.legend(X.keys())

Output:

Since you only have two features in X, there are only two lines.
Here is the code for generating the plot that you posted. First, we need to understand that RidgeCV does not return the coef for each alpha value we feed into the alphas param. The motivation behind RidgeCV is that it tries the different alpha values given in alphas and then, based on cross-validation scoring, returns the best alpha along with the fitted model. Hence, the only way to get the coef for each alpha value with cv is to iterate through RidgeCV using one alpha value at a time. Example:

# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# X is the 10x10 Hilbert matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)

# Compute paths
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

coefs = []
for a in alphas:
    ridge = linear_model.RidgeCV(alphas=[a], fit_intercept=False, cv=3)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

# Display results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('RidgeCV coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
Multiple traces on Polynomial Regression Graph
I am implementing simple polynomial regression to predict the time for a video given its size, using my own dataset. For some reason, I am getting multiple traces on my plot:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('estSize.csv')
X = dataset.iloc[:, 0].values.reshape(-1, 1)
y = dataset.iloc[:, 1].values.reshape(-1, 1)

from sklearn.linear_model import LinearRegression

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X)
poly_reg.fit(X_poly, y)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

# Visualising the Polynomial Regression results
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)), color='blue')
plt.show()
Your data needs to be ordered with respect to the predictor. After the line

dataset = pd.read_csv('estSize.csv')

add this line:

dataset = dataset.sort_values(by=['col1'])

where col1 is your column header for the file-size values.
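Put together, a short sketch of the fixed loading step ('col1' is a placeholder for your actual size-column header):

import pandas as pd
dataset = pd.read_csv('estSize.csv')
dataset = dataset.sort_values(by=['col1'])  # keeps X monotonic so plt.plot draws one trace
X = dataset.iloc[:, 0].values.reshape(-1, 1)
y = dataset.iloc[:, 1].values.reshape(-1, 1)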
Fuzzy clustering on Python with Iris dataset
I am working on fuzzy c-means clustering of the iris dataset, however I cannot visualize it due to some errors. Using this tutorial I wrote the following for iris, but it raises "AttributeError: shape". This is my code:

from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as sm
import skfuzzy as fuzz

iris = datasets.load_iris()
x = pd.DataFrame(iris.data, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])

plt.figure(figsize=(6, 3))
model = fuzz.cluster.cmeans(iris, 3, 2, error=0.005, maxiter=1000, init=None, seed=None)
model.fit(x)
plt.show()

I assumed that passing the parameters to the variable model would be enough, however it raises the above error. If possible, could you show where I made a mistake, and how to fix it? I really appreciate your help!
I tried pre-processing the data first and created a good plot. I simply followed the tutorials: I performed SVD to reduce the dimensions to two, then started to plot; it seems that for the tutorials you only need two dimensions (x, y). You do not need model.fit() — I have not found that command in the documentation. Here is the code:

import numpy as np, pandas as pd, os
import matplotlib
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import skfuzzy as fuzz
from sklearn import datasets

iris = datasets.load_iris()
x = pd.DataFrame(iris.data, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
y = pd.DataFrame(iris.target, columns=['Target'])

scaler = StandardScaler()
X_std = scaler.fit_transform(x)
lsa = TruncatedSVD(2, algorithm='arpack')
dtm_lsa = lsa.fit_transform(X_std)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
a = pd.DataFrame(dtm_lsa, columns=["component_1", "component_2"])
a['targets'] = y

fig1, axes1 = plt.subplots(3, 3, figsize=(8, 8))
alldata = np.vstack((a['component_1'], a['component_2']))
fpcs = []
colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']

for ncenters, ax in enumerate(axes1.reshape(-1), 2):
    cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        alldata, ncenters, 2, error=0.005, maxiter=1000, init=None)

    # Store fpc values for later plots
    fpcs.append(fpc)

    # Plot assigned clusters, for each data point in the training set
    cluster_membership = np.argmax(u, axis=0)
    for j in range(ncenters):
        ax.plot(a['component_1'][cluster_membership == j],
                a['component_2'][cluster_membership == j], '.', color=colors[j])

    # Mark the center of each fuzzy cluster
    for pt in cntr:
        ax.plot(pt[0], pt[1], 'rs')

    ax.set_title('Centers = {0}; FPC = {1:.2f}'.format(ncenters, fpc))
    ax.axis('off')

fig1.tight_layout()
fig1.savefig('iris_dataset.png')
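For completeness, a minimal sketch of the call that avoids the original error without the SVD step: skfuzzy's cmeans is a plain function (there is no .fit() method), and it expects the data transposed to shape (n_features, n_samples):

import numpy as np
import skfuzzy as fuzz
from sklearn import datasets

iris = datasets.load_iris()
alldata = iris.data.T  # shape (4, 150): features as rows, as skfuzzy expects
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    alldata, c=3, m=2, error=0.005, maxiter=1000, init=None)
cluster_membership = np.argmax(u, axis=0)  # hard labels from the fuzzy memberships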