I was trying to make a program to predict the runs scored by a cricketer. I used a CSV file of data I compiled myself. The code is:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
#Data
data = pd.read_csv('Rohit Sharma.csv')
X = [['against','wickets','currentrun','weather','ball','over']]
Y = ['runsmade']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, train_size=None, random_state=42)
reg = LinearRegression()
reg.fit(x_train,y_train)
a = reg.predict(x_test)
print(a)
print(data)
But it showed an error:
ValueError: With n_samples=1, test_size=0.33 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
How to fix it?
Try this:
It looks like you made an error while selecting the columns of the data: X and Y were defined as plain Python lists of column names rather than selections from the DataFrame, so scikit-learn saw only a single sample. Select the columns from data instead, as below.
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
#Data
data = pd.read_csv('Rohit Sharma.csv')
X = data[['against','wickets','currentrun','weather','ball','over']].to_numpy()
Y = data['runsmade'].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state=42)
reg = LinearRegression()
reg.fit(x_train,y_train)
a = reg.predict(x_test)
print(a)
print(data)
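For reference, here is a minimal reproduction of the original mistake: a list containing one inner list has n_samples=1, which is exactly why train_test_split raised the ValueError. (The column names are the ones from the question.)

from sklearn.model_selection import train_test_split

# These are plain Python lists, not DataFrame columns:
X = [['against', 'wickets', 'currentrun', 'weather', 'ball', 'over']]  # one sample
Y = ['runsmade']                                                       # one sample

try:
    train_test_split(X, Y, test_size=0.33, random_state=42)
except ValueError as e:
    print(e)  # With n_samples=1, test_size=0.33 and train_size=None, ...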
Related
Ideally I should get the same result, since score is nothing but R-squared, but I'm not sure why the results come out different.
from sklearn.datasets import california_housing
data = california_housing.fetch_california_housing()
data.data.shape
data.feature_names
data.target_names
import pandas as pd
house_data = pd.DataFrame(data.data, columns=data.feature_names)
house_data.describe()
house_data['Price'] = data.target
X = house_data.iloc[:, 0:8].values
y = house_data.iloc[:, -1].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
#Check R-square on training data
from sklearn.metrics import mean_squared_error, r2_score
y_pred = linear_model.predict(X_test)
print(linear_model.score(X_test, y_test))
print(r2_score(y_pred, y_test))
Output
0.5957643114594776
0.34460597952465033
From the docs: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html
sklearn.metrics.r2_score(y_true, y_pred, ...)
You are passing y_true and y_pred the wrong way around; R-squared is not symmetric in its arguments, so the order matters. If you switch them you get the correct result:
print(linear_model.score(X_test, y_test))
print(r2_score(y_test, y_pred))
0.5957643114594777
0.5957643114594777
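For intuition on why the order matters: r2_score computes 1 - SS_res / SS_tot, where SS_tot is the total sum of squares around the mean of the first argument (the ground truth), so swapping the arguments changes the denominator. A quick check with made-up numbers:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([3.0, 5.0, 2.0, 7.0])   # hypothetical ground truth
y_pred = np.array([2.5, 5.0, 3.0, 8.0])   # hypothetical predictions

print(r2_score(y_true, y_pred))  # truth first: the conventional R-squared
print(r2_score(y_pred, y_true))  # swapped: different denominator, different value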
Trying to do logistic regression with Dask, but I get the error:
Could not find signature for add_intercept: <DataFrame>
My code:
import dask.dataframe as dd
from dask_ml.linear_model import LogisticRegression
from dask_ml.model_selection import train_test_split
from dask_ml.feature_extraction.text import HashingVectorizer
from dask_ml.datasets import make_classification
dask_df = dd.from_pandas(df2, chunksize=50)
dask_df.head()
X_train, X_test, y_train, y_test = train_test_split(df2[features], df2[label], test_size = 0.20, random_state = 42)
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.decision_function(X_test)
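The traceback suggests that dask-glm's add_intercept step is only dispatched for Dask arrays, not DataFrames (note also that the split above was run on the pandas df2 rather than dask_df). As a hedged illustration only: the same pipeline runs cleanly when the inputs are Dask arrays, shown here with synthetic stand-in data since df2 isn't available; with a real Dask DataFrame, to_dask_array(lengths=True) performs the conversion.

from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression
from dask_ml.model_selection import train_test_split

# Synthetic stand-in for df2[features] / df2[label]: Dask ARRAYS, not DataFrames
X, y = make_classification(n_samples=1000, n_features=5, chunks=100, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.decision_function(X_test).compute())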
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
x=np.array([0.1,0.2,0.7,8.0,45.0,56.0,66.0,0.7,0.6,64.0])
y=np.array([0,0,0,1,1,1,1,0,0,1])
x = np.array(x).reshape((1, -1))
y = np.array(y).reshape((1, -1))
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.4, train_size=0.5, random_state=7, stratify=y)
knn = KNeighborsClassifier()
knn.fit(y_train, x_train)
y_train_predict = knn.predict(x_train)
y_test_predict = knn.predict(x_test)
print(y_train_predict)
print(y_test_predict)
Error:
With n_samples=1, test_size=0.4 and train_size=0.5, the resulting train set will be empty. Adjust any of the aforementioned parameters.
Try:
x = np.array(x).reshape(-1, 1)
y = np.array(y).reshape(-1, 1)
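That fixes the split, but note that the original code also calls knn.fit(y_train, x_train) with the arguments swapped; fit takes (X, y). A fully corrected sketch (keeping y one-dimensional, as scikit-learn classifiers expect, and shrinking n_neighbors to fit the tiny training set):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

x = np.array([0.1, 0.2, 0.7, 8.0, 45.0, 56.0, 66.0, 0.7, 0.6, 64.0]).reshape(-1, 1)
y = np.array([0, 0, 0, 1, 1, 1, 1, 0, 0, 1])  # labels stay 1-D

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, train_size=0.5, random_state=7, stratify=y)

knn = KNeighborsClassifier(n_neighbors=3)  # only 5 training samples, so k=3
knn.fit(x_train, y_train)                  # fit takes (X, y), not (y, X)
print(knn.predict(x_train))
print(knn.predict(x_test))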
I am trying to train a linear regression model to continue a graph.
I have a couple of thousand lines of data in my CSV file that I import into NumPy arrays. Here is my code:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import csv
import math
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
def predict():
    sample_data = pd.read_csv("includes\\csv.csv")
    x = np.array(sample_data["day"])
    y = np.array(sample_data["balance"])
    for x in x:
        x = x.reshape(1, -1)
        #lol
    for y in y:
        y.reshape(1, -1)
        #lol
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    clf = LinearRegression()
    clf.fit(x_train, y_train)
    clf.score(x_test, y_test)
When I run this, the error is:
TypeError: Singleton array 6014651 cannot be considered a valid collection.
Any ideas why that's a thing?
After discussion in comments:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import csv
import math
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
def predict():
    sample_data = pd.read_csv("includes\\csv.csv")
    x = np.array(sample_data["day"])
    y = np.array(sample_data["balance"])
    x = x.reshape(-1, 1)
    y = y.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    clf = LinearRegression()
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
X_train and X_test must be capitalized consistently; Python variable names are case-sensitive.
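As for where the "Singleton array" error came from: iterating over a 1-D NumPy array rebinds the loop variable to each scalar element, so after "for y in y" the name y is left pointing at the last element rather than the array (the x loop has the same problem). A toy illustration, reusing the value from the error message:

import numpy as np

y = np.array([0, 1, 6014651])
for y in y:              # the loop variable shadows the original array
    y.reshape(1, -1)     # reshapes a copy of the scalar; the result is discarded

print(y)   # 6014651 -- a scalar, not a collection of samples
# Passing this scalar to train_test_split raises:
# TypeError: Singleton array array(6014651) cannot be considered a valid collection.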
I am trying to use the "class_weight" parameter in scikit-learn for the binary svm.SVC classifier. What I am basically trying to do is vary the precision in class 1 by changing class weights.
Unfortunately, after weeks of trying I have not been able to achieve this, which makes me think there might still be inconsistencies in sklearn...
Here is a minimal example of my code:
import os
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
scaler = preprocessing.StandardScaler()
data = pd.read_csv("...", header=0, delimiter=";", quoting=3, low_memory=False)
def Train_Test_Split(test_size, dataframe, name_y, name_X):
    X = dataframe.ix[:, name_X:]
    y = dataframe[name_y]
    y = np.asarray(y, dtype=int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y)
    y_train = np.asarray(y_train, dtype=int)
    y_test = np.asarray(y_test, dtype=int)
    return (X_train, y_train, X_test, y_test)

def Score(y_test, y_pred):
    a = confusion_matrix(y_test, y_pred, labels=[1, 0])
    Precision_stables = a[0][0] / (a[0][0] + a[1][0])
    Precision_instables = a[1][1] / (a[1][1] + a[0][1])
    return (Precision_stables, Precision_instables)

def Eval_svm(class_ponder, testsize, dataframe, name_y, name_X):
    X_train, y_train, X_test, y_test = Train_Test_Split(testsize, dataframe, name_y, name_X)
    clf_svm = svm.SVC(kernel='linear', class_weight=class_ponder, probability=True)
    clf_svm_optimal = clf_svm.fit(X_train, y_train)
    y_pred_svm = clf_svm_optimal.predict(X_test)
    PRS_svm, PRI_svm = Score(y_test, y_pred_svm)
    return (PRS_svm, PRI_svm)
name_y = "...variableofinterest..."
name_x = "...explanatoryvariables..."
a,b=Eval_svm({0: 100, 1: 1},0.3, data, name_y, name_x)
print(a,b)
Whatever weighting I choose, the precision in class 1 (or even class 0) does not change at all.
Could someone help me here? It's kind of exasperating...
Thank you very much in advance!
Best regards,
F
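For what it's worth, class_weight in svm.SVC does normally move the decision boundary: it rescales the penalty C per class. Below is a self-contained sanity check on synthetic data (a hypothetical stand-in, since the question's CSV isn't available); if the confusion matrix really never changes on the real data, the classes may be separable enough that reweighting C has no effect there.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Synthetic, slightly overlapping two-class problem (hypothetical stand-in data)
X, y = make_classification(n_samples=500, n_features=5, n_informative=3,
                           weights=[0.7, 0.3], class_sep=0.8, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y, random_state=0)

# Compare predictions under different class weightings
for weights in [None, {0: 100, 1: 1}, {0: 1, 1: 100}]:
    clf = SVC(kernel='linear', class_weight=weights)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(weights)
    print(confusion_matrix(y_test, y_pred, labels=[1, 0]))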