How to predict new output from a LinearRegression model built using sklearn?

How to predict new output from a LinearRegression model built using sklearn? - python

I have been following a tutorial on linear regression with scikit-learn. The code works perfectly, and now I want to predict a new output by giving a new input. I have used a data set of student scores and study hours.
Here's the code:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load the student data and show Scores against Hours as a scatter of points.
data=pd.read_csv("/home/crpsm/Pycharm/DataSet/student_scores.csv")
data.plot(x="Hours",y="Scores",style="o")
plt.title("Score-Hour")
plt.xlabel('Hours ')
plt.ylabel('Percentage ')
# Features: every column except the last (kept 2-D). Target: the second column.
x=data.iloc[:,:-1]
y=data.iloc[:,1]
# 55% of the rows go to training; random_state pins the shuffle so runs repeat.
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.55,random_state=5)
regression_model=LinearRegression()
regression_model.fit(x_train,y_train)
print(regression_model.coef_)
print(regression_model.intercept_)
# The split above names the held-out inputs `x_test` (lower case); the
# original `X_test` was never defined and raised NameError.
regression_model.predict(x_test)

# X holds the new inputs to predict for.
# NOTE(review): presumably X must be 2-D like the training x (one row per
# sample, e.g. [[9.5]] for a single Hours value) - confirm against the docs.
prediction = regression_model.predict(X)
Please read the docs:
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

Related

n_splits=10 cannot be greater than the number of members in each class

I'm trying to perform a model selection between KNN and Logistic Regression using sampling technique as 10-fold cross validation, but keep getting the above error after the last part. Could I please be advised on what I'm doing wrong? Thanks.
Here is my code:
import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.model_selection import cross_val_score
# Load the data
mtcars_df = pd.read_csv('mtcars.csv')
mtcars_df.head()
# Features: every column from position 1 onward, as a NumPy array.
X = mtcars_df.iloc[:,1:].values
# Labels: the 'model' column.
# NOTE(review): the reported "n_splits=10 cannot be greater than the number of
# members in each class" error concerns how many rows share each label in Y;
# presumably 'model' is (nearly) unique per row - confirm against the data.
Y = mtcars_df['model'].values
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
# Mean accuracy over cv=10 cross-validation folds.
print(cross_val_score(knn,X,Y, cv=10, scoring='accuracy').mean())

Using linear regression for prediction in python

I am a newbie in Python. I am trying a simple prediction using linear regression. Here is the dataset I am working with:
serial Tt Tv Nc
90 71675 425595 3171
91 74865 441560 15965
Here is the code
#import library
import statsmodels.api as sm
from sklearn import datasets
import numpy as np
import pandas as pd
# Load the dataset from the Excel workbook.
data = pd.read_excel("testtv.xlsx")
# Import the linear regression estimator and the train/test splitter.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Independent variable (x) is the 'Tt' column; dependent variable (y) is 'Tv'.
x=data['Tt']
y=data['Tv']
# Hold out 10% of the rows for testing, then fit on the remainder.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1)
lr = LinearRegression()
# reshape(-1,1) turns each 1-D column into a single-column 2-D array before fitting.
lr.fit(np.array(x_train).reshape(-1,1),np.array(y_train).reshape(-1,1))
How can I print the predicted value for the next 7 'serial' values? I have used 'Tt' as the x values and 'Tv' as the y values. Moreover, increasing to test_size=0.8 gives a wrong prediction value. How can I fix it?

plot training and validation loss curves?

I am working with the yeast dataset available at:
http://archive.ics.uci.edu/ml/datasets/yeast
and I want to make a neural network classifier model and plot the learning curves. So, I have used the model_selection of scikit twice; one for making the training and testing set and once more for selecting the validation set. From these two sets I would like to plot the learning curves, my code is the following:
import numpy as np
import pandas as pd
from sklearn import model_selection, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
def readFile(file):
    """Load a whitespace-delimited data file into a labelled DataFrame.

    The file is read as having no header row, so every line, including
    the first, is kept as a data record. The ten fixed column names are
    attached during parsing.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``seq_n``, ``mcg``, ``gvh``,
        ``alm``, ``mit``, ``erl``, ``pox``, ``vac``, ``nuc`` and ``site``.
    """
    head = ["seq_n", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "site"]
    # header=None is essential: with the default header inference pandas
    # consumes the first record as column labels, and overwriting the labels
    # afterwards silently drops that record.
    return pd.read_csv(file, delimiter=r"\s+", header=None, names=head)
def NeuralClass(X, y):
    """Fit an MLP classifier and plot the loss curve of two successive fits.

    The data is split 80/20 into a training part and a test part, and the
    training part is split 80/20 again to obtain a validation part. One
    classifier is fitted on the full training part; its training accuracy
    is printed and its loss curve plotted. The same classifier is then
    fitted again on the validation part and that loss curve is plotted too.

    Args:
        X: Feature table passed to ``train_test_split``.
        y: Target labels aligned with ``X``.
    """
    split = model_selection.train_test_split
    # The test part, and the smaller fit part of the second split, are
    # produced but never used below.
    train_X, _test_X, train_y, _test_y = split(X, y, test_size=0.2)
    _fit_X, val_X, _fit_y, val_y = split(train_X, train_y, test_size=0.2)

    classifier = MLPClassifier(activation="relu", max_iter=3000)

    # First curve: loss recorded while fitting on the training part.
    classifier.fit(train_X, train_y)
    print(classifier.score(train_X, train_y))
    plt.plot(classifier.loss_curve_)

    # Second curve: loss recorded while fitting again on the validation part.
    classifier.fit(val_X, val_y)
    plt.plot(classifier.loss_curve_)
def main():
    """Read ``yeast.data``, separate features from the target, and train.

    The ``seq_n`` and ``site`` columns are removed from the feature table;
    ``site`` is used as the target passed to ``NeuralClass``.
    """
    f = readFile("yeast.data")
    # Named to avoid shadowing the builtin ``list``.
    non_feature_cols = ["seq_n", "site"]
    # axis is passed by keyword: newer pandas rejects a positional axis here.
    X = f.drop(non_feature_cols, axis=1)
    y = f["site"]
    NeuralClass(X, y)


if __name__ == "__main__":
    main()
I have obtained a graph like the following, but I do not know if it's correct:
The question is if this would be the correct way to plot the validation curve or if the method I followed is the right one.
Thanks

Didn't test it, but should be something like this:
def NeuralClass(X, y):
    """Fit an MLP classifier with early stopping and plot its two curves.

    The data is split 80/20 into a training part and a test part. The
    classifier is created with ``early_stopping=True`` and
    ``validation_fraction=0.2`` and fitted on the training part. Its
    training accuracy is printed, then ``loss_curve_`` and
    ``validation_scores_`` are plotted.

    NOTE(review): ``validation_scores_`` presumably holds scores rather than
    losses, so the two plotted curves may be on different scales - confirm
    against the scikit-learn documentation.

    Args:
        X: Feature table passed to ``train_test_split``.
        y: Target labels aligned with ``X``.
    """
    # The test part is produced by the split but not used below.
    train_X, _test_X, train_y, _test_y = model_selection.train_test_split(
        X, y, test_size=0.2
    )

    settings = {
        "activation": "relu",
        "max_iter": 3000,
        "validation_fraction": 0.2,
        "early_stopping": True,
    }
    classifier = MLPClassifier(**settings)
    classifier.fit(train_X, train_y)

    print(classifier.score(train_X, train_y))
    plt.plot(classifier.loss_curve_)
    plt.plot(classifier.validation_scores_)

Trying to run regression code. Getting error about 'linear_model'

I am trying to run this regression code.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.cross_validation
# Load the data (thousands=',' parses numbers written with comma separators)
oecd_bli = pd.read_csv("C:/Users/Excel/Desktop/Briefcase/PDFs/ALL PYTHON & R CODE SAMPLES/Hands-On Machine_Learning_with_Scikit_Learn_and_Tensorflow/GDP Per Capita/oecd_bli.csv", thousands=',')
gdp_per_capita = pd.read_csv("C:/Users/Excel/Desktop/Briefcase/PDFs/ALL PYTHON & R CODE SAMPLES/Hands-On Machine_Learning_with_Scikit_Learn_and_Tensorflow/GDP Per Capita/gdp_per_capita.csv",thousands=',')
# View the first 10 rows of each data frame
oecd_bli[:10]
gdp_per_capita[:10]
# NOTE(review): the two tables are joined on their row index, not on a
# country column - confirm that the rows of both files line up.
country_stats = pd.merge(oecd_bli, gdp_per_capita, left_index=True, right_index=True)
country_stats[:10]
# np.c_ turns each selected column into a 2-D single-column array.
X = np.c_[country_stats["GDP"]]
Y = np.c_[country_stats["VALUE"]]
print(X)
print(Y)
# Visualize the data
country_stats.plot(kind='scatter', x="GDP", y='VALUE')
plt.show()
# Select a linear model
# NOTE: this attribute access is the line the reported AttributeError refers
# to - the imports above never load the sklearn.linear_model subpackage.
lin_reg_model = sklearn.linear_model.LinearRegression()
# Train the model
lin_reg_model.fit(X, Y)
# Make a prediction for Cyprus
X_new = [[22587]] # Cyprus' GDP per capita
print(lin_reg_model.predict(X_new))
I get this error.
AttributeError: module 'sklearn' has no attribute 'linear_model'
I'm not sure what's going on. I am trying to learn about this from an example that I saw in a book.

# Import the estimator class directly from its subpackage.
from sklearn.linear_model import LinearRegression
# Build the model (create a regression object).
model = LinearRegression()
# Fit the model to the inputs x and targets y.
model.fit(x,y)

linear_model is a subpackage of sklearn. It won't work if you only import via: import sklearn. Try import sklearn.linear_model instead.

Python does not automatically import all the subpackages. When I explicitly imported linear_model, it worked:
from sklearn import linear_model

Unable to run logit model/ logistic regression

I'm trying to run a logistic regression. The data has been scrubbed and the categorical variables changed to dummies; however, when I run the code I get an error message from the "statsmodels" package, outside of my code, and I'm not sure how to correct it in this case.
A friend of mine ran the same code and he got an output (print screen below), as i'm using spyder with python 3.6 he thinks it might be a version issue - he is using python 3.5
I've got the code below. Any ideas on how to fix it or how better to run a logistic regression is appreciated.
error message i'm getting is in statsmodels library:
File "C:\Users\sebas\Anaconda3\lib\site-packages\statsmodels\discrete\discrete_model.py", line 2405, in llr_pvalue
return stats.chisqprob(self.llr, self.df_model)
AttributeError: module 'scipy.stats' has no attribute 'chisqprob'
thanks!
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.cross_validation import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
################################################################################
## Logistic regression
###############################################################################
# Load the lending data set.
data = pd.read_csv(r"log reg test Lending club 2007-2011 car only.csv")
#data = data.dropna()
print(data.shape)
##print(list(data.columns))
print(data['Distressed'].value_counts()) ## number of defaulted car loans is binary
sns.countplot(x='Distressed', data=data, palette='hls')
# Call show(): the bare name `plt.show` is an expression that does nothing,
# so the count plot was never displayed.
plt.show() ## confirm dependent variable is binary
##basic numerical analysis of variables to check feasibility for model
## we will need to create dummy variables for strings
#print(data.groupby('Distressed').mean()) ##numerical variable means
#print(data.groupby('grade').mean()) ## string variable means
#print(data.groupby('sub_grade').mean())
#print(data.groupby('emp_length').mean())
#print(data.groupby('home_ownership').mean())
## testing for nulls in dataset
print(data.isnull().sum())
# axis is passed by keyword: newer pandas rejects a positional axis in drop().
scrub_data=data.drop(['mths_since_last_delinq'],axis=1) ## this variable is not statistically significant
print('Here is the logit model data')
print(scrub_data.isnull().sum()) ## removed records of missing info, sample still sufficiently large
print(list(scrub_data.columns))
print(scrub_data.head())
## convert categorical variables to dummies completed in csv file
# .ix was removed from pandas; .iloc selects the same columns by position.
# Features are the columns at positions 1..22, the target is column 0.
X=scrub_data.iloc[:,list(range(1,23))].values
y=scrub_data.iloc[:,0].values
# Hold out 30% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3, random_state=25)
LogReg=LogisticRegression()
LogReg.fit(X_train,y_train)
y_pred=LogReg.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))
print('alternative method using RFE')
#y=['Distressed']
#x=[i for i in data if i not in y]
#print(y)
#print(x)
#print(data.info())
## check for independence between features
correlation=sns.heatmap(data.corr()) ## heatmap showing correlations of the variables
# Prints whatever object sns.heatmap returned, not the correlation values.
print(correlation)
from sklearn.svm import LinearSVC
#logreg = LogisticRegression()
#rfe = RFE(logreg,10)
#rfe=rfe.fit(x,y)
#print(rfe.support_)
#print(rfe.ranking_)
import statsmodels.api as sm
# Fit a statsmodels logit on the full X and y arrays (not the train/test split).
# NOTE(review): no constant column is added to X anywhere in this script -
# confirm that fitting without an intercept is intended.
logit_model=sm.Logit(y,X)
result=logit_model.fit()
# NOTE(review): presumably the call that reaches llr_pvalue and raises the
# reported `chisqprob` AttributeError - confirm against the full traceback.
print(result.summary())

The error can be fixed by assigning the missing function back into the scipy.stats namespace as shown below:
from scipy import stats


def _chisqprob(chisq, df):
    """Return the chi-square survival function value for *chisq* with *df* degrees of freedom."""
    return stats.chi2.sf(chisq, df)


# Put the function back under the name the statsmodels traceback looks up.
stats.chisqprob = _chisqprob

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to predict new output from a LinearRegression model built using sklearn? - python

prediction = regression_model.predict(X) Please read the docs: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

Related

n_splits=10 cannot be greater than the number of members in each class

Using linear regression for prediction in python

plot training and validation loss curves?

Trying to run regression code. Getting error about 'linear_model'

Unable to run logit model/ logistic regression

Categories

Resources