How do I get rid of this error?
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv("train.csv")
clean = {"Sex": {"male":1, "female":0}}
df.replace(clean, inplace = True)
df["label"] = df['Survived']
df = df.drop(["Name","Ticket","Cabin","Embarked","Fare","Parch","Survived"], axis = 1)
df = df.dropna(axis = 0, how="any")
X = df.drop(["label"],axis = 1).values
y = df["label"].values
X_train , y_train, X_test, y_test = train_test_split(X, y, test_size = 0.7)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print("Accuracy on test subset: (:.3f)".format(log_reg.score(X_train, y_train)))
ERROR
Traceback (most recent call last):
File "C:\Users\user\Documents\17\kaggle'\logistic.py", line 20, in <module>
log_reg.fit(X_train, y_train)
File "C:\Users\user\AppData\Local\Programs\Python\Python36-32\lib\site-packages\sklearn\linear_model\logistic.py", line 1216, in fit
order="C")
File "C:\Users\user\AppData\Local\Programs\Python\Python36-32\lib\site-packages\sklearn\utils\validation.py", line 547, in check_X_y
y = column_or_1d(y, warn=True)
File "C:\Users\user\AppData\Local\Programs\Python\Python36-32\lib\site-packages\sklearn\utils\validation.py", line 583, in column_or_1d
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (500, 5)
The error is due to this:
X_train , y_train, X_test, y_test = train_test_split(X, y, test_size = 0.7)
This is not the order in which train_test_split returns its outputs.
The actual usage should be:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7)
train_test_split returns the split arrays in the order of the supplied data: X is split first and returned as X_train, X_test, and then y is split and returned as y_train, y_test.
Hope this helps.
Related
I am trying to run the following code
# Data Pre-processing Step
# importing libraries
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
# importing datasets
data_set = pd.read_csv('/Users/apple/Desktop/parkinsons.data')
# Extracting Independent and dependent Variable
x = data_set.iloc[:, [2, 3]].values
y = data_set.iloc[:, 4].values
# Splitting the dataset into training and test set.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
# feature Scaling
from sklearn.preprocessing import StandardScaler
st_x = StandardScaler()
x_train = st_x.fit_transform(x_train)
x_test = st_x.transform(x_test)
print(x_test)
from sklearn.svm import SVC # "Support vector classifier"
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(x_train, y_train)
The line "classifier.fit(x_train, y_train)" fails with the following error:
Traceback (most recent call last):
File "/Users/apple/PycharmProjects/pythonProject4/main.py", line 30, in <module>
classifier.fit(x_train, y_train)
File "/Users/apple/PycharmProjects/pythonProject4/venv/lib/python3.10/site-packages/sklearn/svm/_base.py", line 201, in fit
y = self._validate_targets(y)
File "/Users/apple/PycharmProjects/pythonProject4/venv/lib/python3.10/site-packages/sklearn/svm/_base.py", line 745, in _validate_targets
check_classification_targets(y)
File "/Users/apple/PycharmProjects/pythonProject4/venv/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 207, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'
Process finished with exit code 1
What is wrong with my code? Is it due to the version?
I am using PyCharm with Python 3.10.
I am getting an error when trying to use statsmodels .predict to predict my test values.
Code:
X_train, X_test, y_train, y_test = train_test_split(X_new_np, y, test_size=0.2, random_state=42)
logit = sm.Logit(y_train, X_train)
reg = logit.fit_regularized(start_params=None, method='l1_cvxopt_cp', maxiter= 1000, full_output=1, disp=1, callback=None, alpha=.01, trim_mode='auto', auto_trim_tol=0.01, size_trim_tol=0.0001, qc_tol=0.03)
reg.summary()
y_pred_test = logit.predict(X_test)
Error:
ValueError: shapes (1000,61) and (251,61) not aligned: 61 (dim 1) != 251 (dim 0)
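You are not predicting from the right object. logit is the unfitted model, and its predict method expects the parameter vector as its first argument, so X_test is being interpreted as the parameters — hence the shape mismatch in the error. reg is the fitted result, so you should call reg.predict(X_test) instead. The following code runs without error (I used your fit_regularized parameters).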
from sklearn.model_selection import train_test_split
import numpy as np
from statsmodels.api import Logit
x = np.random.randn(100,50)
y = np.random.randint(0,2,100).astype(bool)
print(x.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2)
logit = Logit(y_train, X_train)
reg = logit.fit_regularized(start_params=None, method='l1_cvxopt_cp',
maxiter= 1000, full_output=1, disp=1, callback=None,
alpha=.01, trim_mode='auto', auto_trim_tol=0.01,
size_trim_tol=0.0001, qc_tol=0.03)
print(reg.summary())
y_pred_test = reg.predict(X_test)
from sklearn.model_selection import train_test_split
X = data.drop('Vickers Hardness\n(HV0.5)', axis=1)
y = data['Vickers Hardness\n(HV0.5)']
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size = 0.3)
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
ValueError: y should be a 1d array, got an array of shape (3, 5) instead.
Used data:
How can I rectify this error in naive Bayes? How can I put y into a 1D array?
The outputs of train_test_split are unpacked in the wrong order; use:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
I successfully built a classification model that predicts a single binary output with an ANN, using the pandas and sklearn libraries. Now I want the model to predict another feature that is not binary: the input columns of my data set are (0,1,4,6,7,8,11,12,13,14) and the output column is (15). A typical row is [4096,0.07324,1.7,20,5.2,64,0.142,0.5,35,30,584.232], where some of the values are floats. How can I predict 584.232 from the first ten numbers using logistic regression?
Thank you all.
dataset = pd.read_csv("DataSet.csv")
X = dataset.iloc[:, [0,1,4,6,7,8,11,12,13,14]].values
y = dataset.iloc[:, 15].values
To avoid a type error, I converted the input values to float in the following way:
dataset['ColumnsName'] = dataset['ColumnsName'].astype(float)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEncoder_X_delay_1 = LabelEncoder()
X[:, 1] = labelEncoder_X_1.fit_transform(X[:, 1])
labelEncoder_X_delay_2 = LabelEncoder()
X[:, 2] = labelEncoder_X_2.fit_transform(X[:, 2])
# normalizing the input
X = X.T
X = X / np.amax(X, axis=1)
X = X.T
# splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =
train_test_split(X, y, test_size = 0.2, random_state = 0)
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# fitting logistic regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
But when I run the code up to this point, it gives the following error:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
Traceback (most recent call last):
File "<ipython-input-5-f18c8875152f>", line 3, in <module>
classifier.fit(X_train, y_train)
File "C:\Users\ali\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1528, in fit
check_classification_targets(y)
File "C:\Users\ali\anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 169, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'
I have already converted the predefined columns from string to float!
dataset = pd.read_csv("DataSet.csv")
X = dataset.iloc[:, [0,1,4,6,7,8,11,12,13,14]].values
y = dataset.iloc[:, 15].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEncoder_X_delay_1 = LabelEncoder()
X[:, 1] = labelEncoder_X_1.fit_transform(X[:, 1])
labelEncoder_X_delay_2 = LabelEncoder()
X[:, 2] = labelEncoder_X_2.fit_transform(X[:, 2])
# normalizing the input
X = X.T
X = X / np.amax(X, axis=1)
X = X.T
# splitting the dataset into the training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Activation Function
model = Sequential()
model.add(Dense(6, input_dim=9, activation= "relu"))
model.add(Dense(6, activation= "relu"))
model.add(Dense(6, activation= "relu"))
model.add(Dense(1))
# splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =
train_test_split(X, y, test_size = 0.2, random_state = 0)
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# fitting logistic regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
I have a dataframe to work with, and when I run the train/test split on the data, this error message pops up:
too many values to unpack (expected 2)
I set my target column, Global, as the y value and the rest of the columns as X for train_test_split. I am not sure where to start fixing this issue.
X = df[['Year_of_Release', 'Critic_Score', 'Critic_Count',
'User_Score', 'User_Count', 'Platform_PC', 'Platform_PS3',
'Platform_PS4', 'Platform_Wii', 'Platform_X360',
'Platform_XOne', 'Genre_Action', 'Genre_Adventure', 'Genre_Fighting',
'Genre_Misc', 'Genre_Platform', 'Genre_Puzzle', 'Genre_Racing',
'Genre_Role-Playing', 'Genre_Shooter', 'Genre_Simulation',
'Genre_Sports', 'Genre_Strategy', 'Rating_E', 'Rating_E10+', 'Rating_M',
'Rating_RP', 'Rating_T']]
y = df[['Global']]
print(X.shape)
print(y.shape)
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(X_train, train_size=0.8, test_size=0.2, random_state=42)
target = 'Global'
y_train = X_train[target]
y_val = X_val[target]
y_test = X_test[target]
X_train = X_train.drop(columns=target)
X_val = X_val.drop(columns=target)
X_test = X_test.drop(columns=target)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-75-d3fede999d7b> in <module>()
1 from sklearn.model_selection import train_test_split
2
----> 3 X_train, X_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)
4
5
ValueError: too many values to unpack (expected 2)
X = df[['Year_of_Release', 'Critic_Score', 'Critic_Count',
'User_Score', 'User_Count', 'Platform_PC', 'Platform_PS3',
'Platform_PS4', 'Platform_Wii', 'Platform_X360',
'Platform_XOne', 'Genre_Action', 'Genre_Adventure', 'Genre_Fighting',
'Genre_Misc', 'Genre_Platform', 'Genre_Puzzle', 'Genre_Racing',
'Genre_Role-Playing', 'Genre_Shooter', 'Genre_Simulation',
'Genre_Sports', 'Genre_Strategy', 'Rating_E', 'Rating_E10+', 'Rating_M',
'Rating_RP', 'Rating_T']]
y = df[['Global']]
print(X.shape)
print(y.shape)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_test_split from sklearn returns four values when given X and y, in this order: X_train, X_test, y_train, y_test.
Refer to the official documentation here.
Also, you only need to specify one of test_size or train_size; the other is automatically set to its complement.