I am working with the yeast dataset available at:
http://archive.ics.uci.edu/ml/datasets/yeast
and I want to build a neural network classifier and plot its learning curves. I have used scikit-learn's model_selection twice: once to split off the test set, and once more to carve a validation set out of the training set. From these two sets I would like to plot the learning curves. My code is the following:
import numpy as np
import pandas as pd
from sklearn import model_selection, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
def readFile(file):
    head = ["seq_n", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "site"]
    # yeast.data has no header row, so pass the column names directly
    f = pd.read_csv(file, delimiter=r"\s+", header=None, names=head)
    return f

def NeuralClass(X, y):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
    X_tr, X_val, y_tr, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2)
    mlp = MLPClassifier(activation="relu", max_iter=3000)
    mlp.fit(X_train, y_train)
    print(mlp.score(X_train, y_train))
    plt.plot(mlp.loss_curve_)
    mlp.fit(X_val, y_val)
    plt.plot(mlp.loss_curve_)

def main():
    f = readFile("yeast.data")
    cols = ["seq_n", "site"]
    X = f.drop(columns=cols)
    y = f["site"]
    NeuralClass(X, y)

if __name__ == "__main__":
    main()
I have obtained a graph like the following, which I do not know if it's correct:
My question is whether this is the correct way to plot the validation curve, or whether the method I followed is the right one.
Thanks
Didn't test it, but should be something like this:
def NeuralClass(X, y):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)
    mlp = MLPClassifier(
        activation="relu",
        max_iter=3000,
        validation_fraction=0.2,
        early_stopping=True)
    mlp.fit(X_train, y_train)
    print(mlp.score(X_train, y_train))
    plt.plot(mlp.loss_curve_)
    plt.plot(mlp.validation_scores_)
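One caveat on reading that plot: loss_curve_ holds the training loss per iteration, while validation_scores_ (only populated when early_stopping=True) holds the validation accuracy per iteration, so the two curves live on different scales. A minimal plotting sketch that keeps the scales apart, assuming mlp has been fitted as above:

fig, ax1 = plt.subplots()
ax1.plot(mlp.loss_curve_, color="tab:blue")
ax1.set_xlabel("iteration")
ax1.set_ylabel("training loss")
ax2 = ax1.twinx()  # second y-axis, since accuracy and loss differ in scale
ax2.plot(mlp.validation_scores_, color="tab:orange")
ax2.set_ylabel("validation accuracy")
plt.show()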
I'm trying to perform model selection between KNN and Logistic Regression using 10-fold cross-validation as the sampling technique, but I keep getting the above error after the last part. Could I please be advised on what I'm doing wrong? Thanks.
Here is my code:
import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.model_selection import cross_val_score
#Load data
mtcars_df = pd.read_csv('mtcars.csv')
mtcars_df.head()
X = mtcars_df.iloc[:,1:].values
Y = mtcars_df['model'].values
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
print(cross_val_score(knn,X,Y, cv=10, scoring='accuracy').mean())
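For reference, a minimal sketch of this kind of model comparison, assuming a classification target whose classes repeat across rows (the iris data is used here purely as a stand-in; cross-validation needs each class to appear in multiple folds, which a unique-per-row label like a car model name cannot satisfy):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=5)
logreg = LogisticRegression(max_iter=1000)
# mean 10-fold accuracy for each candidate model
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())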
I am trying to practice using scikit-learn to build a k-nearest neighbors prediction model on the Iris dataset. This is what I have written:
import sklearn
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
iris = datasets.load_iris()
X = iris.data
y = iris.target
knn =KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)
This is my output: KNeighborsClassifier(n_neighbors=6)
However, I think I should be getting: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=6, p=2, weights='uniform')
Also, I tried to predict the target value based on a new array of X values (X_new) as below:
X_new = np.array([[5.6,2.8,3.9,1.1],[5.7,2.6,3.8,1.3],[4.7,3.2,1.3,0.2]])
Pred = knn.predict(X_new)
print(Pred)
However, it didn't provide any output at all. Any assistance/advice would be appreciated!
I think your code works fine, considering I ran it on Google Colab (link to the notebook: https://colab.research.google.com/drive/1FROuNe4NMD6D2HCCEtz6TePlCccbGFZm?usp=sharing).
Do check it out and maybe try reproducing the issue.
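One likely explanation, assuming a recent scikit-learn version: since version 0.23 the repr of an estimator only shows parameters that differ from their defaults, so KNeighborsClassifier(n_neighbors=6) is the expected output, not an error. The full repr can be restored with set_config, and predict does return an array as long as it is printed. A minimal sketch:

import sklearn
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

# recent scikit-learn only prints non-default parameters;
# this restores the full repr the asker expected
sklearn.set_config(print_changed_only=False)

iris = datasets.load_iris()
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(iris.data, iris.target)
print(knn)  # now shows algorithm='auto', leaf_size=30, ...

X_new = np.array([[5.6, 2.8, 3.9, 1.1], [5.7, 2.6, 3.8, 1.3], [4.7, 3.2, 1.3, 0.2]])
print(knn.predict(X_new))  # array of predicted class labels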
I am a newbie in Python. I am trying a simple prediction using linear regression. Here is the dataset I am working with:
serial Tt Tv Nc
90 71675 425595 3171
91 74865 441560 15965
Here is the code:
#import library
import statsmodels.api as sm
from sklearn import datasets
import numpy as np
import pandas as pd
#dataset
data = pd.read_excel("testtv.xlsx")
#import linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
#getting dependent variable and independent variable
x=data['Tt']
y=data['Tv']
#train/test split and model fitting
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1)
lr = LinearRegression()
lr.fit(np.array(x_train).reshape(-1,1),np.array(y_train).reshape(-1,1))
How can I print the predicted values for the next 7 'serial' entries? I have used 'Tt' as the x values and 'Tv' as the y values. Moreover, increasing test_size to 0.8 gives wrong prediction values. How can I fix it?
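As for test_size=0.8: that trains on only 20% of an already tiny dataset, so the fitted line is unreliable; keeping the test fraction small (e.g. 0.1-0.2) leaves more data to learn from. For the prediction itself, a minimal sketch, assuming the next seven 'Tt' values are known or extrapolated (the numbers below are made up purely for illustration):

# hypothetical future Tt values -- replace with the real ones
future_tt = np.array([76000, 77000, 78000, 79000, 80000, 81000, 82000])
predictions = lr.predict(future_tt.reshape(-1, 1))
for tt, tv in zip(future_tt, predictions.ravel()):
    print(f"Tt={tt} -> predicted Tv={tv:.1f}")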
I have been following a tutorial on linear regression with scikit-learn. The code works perfectly, and now I want to predict a new output by giving it a new input. I have used the student scores and study hours dataset.
Here's the code:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
data=pd.read_csv("/home/crpsm/Pycharm/DataSet/student_scores.csv")
data.plot(x="Hours",y="Scores",style="o")
plt.title("Score-Hour")
plt.xlabel('Hours ')
plt.ylabel('Percentage ')
x=data.iloc[:,:-1]
y=data.iloc[:,1]
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.55,random_state=5)
regression_model=LinearRegression()
regression_model.fit(x_train,y_train)
print(regression_model.coef_)
print(regression_model.intercept_)
print(regression_model.predict(x_test))  # X_test was undefined; the split variables are lowercase
prediction = regression_model.predict(x)
print(prediction)
Please read the docs:
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
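Beyond the docs, a minimal sketch of predicting for a genuinely new input, assuming a hypothetical value of 9.25 study hours (the number is made up for illustration):

# predict() expects a 2-D array of shape (n_samples, n_features)
new_hours = [[9.25]]  # hypothetical new input
predicted_score = regression_model.predict(new_hours)
print(predicted_score)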
I'm trying to run a logistic regression. The data has been scrubbed and the categorical variables changed to dummies, however when I run the code I get an error message from the "statsmodels" package, outside of my own code, and I'm not sure how to correct it in this case.
A friend of mine ran the same code and he got an output (print screen below). As I'm using Spyder with Python 3.6, he thinks it might be a version issue - he is using Python 3.5.
I've got the code below. Any ideas on how to fix it, or on how better to run a logistic regression, are appreciated.
The error message I'm getting is in the statsmodels library:
File "C:\Users\sebas\Anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py", line 2405, in llr_pvalue
return stats.chisqprob(self.llr, self.df_model)
AttributeError: module 'scipy.stats' has no attribute 'chisqprob'
Thanks!
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
################################################################################
## Logistic regression
###############################################################################
data = pd.read_csv(r"log reg test Lending club 2007-2011 car only.csv")
#data = data.dropna()
print(data.shape)
##print(list(data.columns))
print(data['Distressed'].value_counts()) ## number of defaulted car loans is binary
sns.countplot(x='Distressed', data=data, palette='hls')
plt.show()  ## confirm dependent variable is binary
##basic numerical analysis of variables to check feasibility for model
## we will need to create dummy variables for strings
#print(data.groupby('Distressed').mean()) ##numerical variable means
#print(data.groupby('grade').mean()) ## string variable means
#print(data.groupby('sub_grade').mean())
#print(data.groupby('emp_length').mean())
#print(data.groupby('home_ownership').mean())
##testing for nulls in dataset
print(data.isnull().sum())
scrub_data=data.drop(columns=['mths_since_last_delinq'])  ## this variable is not statistically significant
print('Here is the logit model data')
print(scrub_data.isnull().sum()) ## removed records of missing info, sample still sufficiently large
print(list(scrub_data.columns))
print(scrub_data.head())
##convert categorical variables to dummies completed in csv file
X=scrub_data.iloc[:,1:23].values  ## .ix is deprecated; iloc selects columns 1-22 by position
y=scrub_data.iloc[:,0].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3, random_state=25)
LogReg=LogisticRegression()
LogReg.fit(X_train,y_train)
y_pred=LogReg.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))
print('alternative method using RFE')
#y=['Distressed']
#x=[i for i in data if i not in y]
#print(y)
#print(x)
#print(data.info())
## check for independance between features
correlation=sns.heatmap(data.corr()) ## heatmap showing correlations of the variables
print(correlation)
from sklearn.svm import LinearSVC
#logreg = LogisticRegression()
#rfe = RFE(logreg,10)
#rfe=rfe.fit(x,y)
#print(rfe.support_)
#print(rfe.ranking_)
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary())
The error can be fixed by assigning the missing function back into the scipy.stats namespace, as shown below. scipy.stats.chisqprob was removed in SciPy 1.0, and stats.chi2.sf (the chi-squared survival function) computes exactly the same quantity:
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
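A quick usage note: the patch has to run before result.summary() is called, since that is where statsmodels evaluates llr_pvalue. A sketch of how it slots into the code above:

from scipy import stats
import statsmodels.api as sm

# restore the alias removed from SciPy before statsmodels looks it up
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary())  # llr_pvalue no longer raises AttributeError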