I'm using scikit's logistic regression but I keep getting the message:
Found input variables with inconsistent numbers of samples: [90000, 5625]
In the code below, I've removed the columns with text in them and then I've split the data into a training and testing set.
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
# Load the telco data; cells holding a single space are read as missing
# values so that the incomplete rows can be dropped.
dataset = pd.read_csv("/Users/An/Desktop/data/telco.csv", na_values = ' ')
dataset = dataset.dropna(axis = 0)

# Map the listed string answers onto 1/0 flags.
dataset = dataset.replace({'Yes':1, 'Fiber optic': 1, 'DSL':1, 'No':0, 'No phone service':0, 'No internet service':0})

# Drop the remaining text columns.
dataset = dataset.drop(['Contract', 'PaymentMethod', 'customerID', 'gender'], axis=1)

# Standardise the continuous columns: subtract the mean, divide by the
# standard deviation.
for col in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    sd = np.std(dataset[col])
    mean = np.mean(dataset[col])
    dataset[col] = (dataset[col] - mean) / sd

total = pd.DataFrame(dataset)

# Hold out 20% of the rows, then switch to plain arrays for positional slicing.
data_train, data_test = train_test_split(total, test_size=0.2)
data_train = data_train.values
data_test = data_test.values

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1e9)
# Columns 0..15 are used as features and column 16 as the target.
clf = clf.fit(data_train[:,0:16], data_train[:,16])
print(clf.intercept_, clf.coef_)
Could someone please explain what the error message means and help me figure out why I'm getting it?
In the second-to-last line, data_train.reshape(-1, 1) is causing your problem. Removing the reshape will fix it.
Reason
LogisticRegression.fit is expecting x and y to have same shape[0], but you are reshaping your x from (n, m) to (n*m, 1).
Here is the reproduced shapes:
import numpy as np

# Stand-in for the real data: only the shapes matter here, so the values are
# left uninitialised.
df = np.empty((2000, 10))
x, y = df[:, 2:9], df[:, 9]

x.shape, y.shape # << what you should give to `clf.fit`
# ((2000, 7), (2000, ))

x.reshape(-1, 1).shape, y.shape # << what you ARE giving to `clf.fit`,
# ((14000, 1), (2000,)) # << which is causing the problem
Related
I have a battery dataframe with rows representing various cycles and a set of features for that cycle:
As an example row 1:
# Column layout of one battery-cycle record.
_CYCLE_COLUMNS = [
    'Ecell_V',
    'I_mA',
    'EnergyCharge_W_h',
    'QCharge_mA_h',
    'EnergyDischarge_W_h',
    'QDischarge_mA_h',
    'Temperature__C',
    'cycleNumber',
    'SOH',
    'Cell',
]

# Start from an empty frame, then add the example record under index label 0.
df = pd.DataFrame(columns=_CYCLE_COLUMNS)
df.loc[0] = [
    3.730646,
    2988.8713,
    0.185061,
    49.724845,
    0.0,
    0.0,
    27.5,
    2,
    0.99,
    'VAH11',
]
There are 600,000 rows
I am trying to predict the value for SOH as follows:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
# Load the training data and set the 'Cell' column aside; it is removed from
# train_data and kept in train_cell.
train_data = pd.read_csv("train_data.csv")
train_cell = train_data.pop('Cell')
# reduce size of df train for comp purposes: keep every 20th row and renumber
train_data = train_data.iloc[::20, :]
train_data = train_data.reset_index(drop=True)
# remove unwanted features
train_data.pop('Ns')
train_data.pop('time_s')
# scale the data (every remaining column, including the 'SOH' target)
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data)
# return to df: fit_transform gives back a plain array, so re-attach the column names
train_data_scaled = pd.DataFrame(train_data_scaled, columns=['Ecell_V', 'I_mA', 'EnergyCharge_W_h', 'QCharge_mA_h',
'EnergyDischarge_W_h', 'QDischarge_mA_h', 'Temperature__C',
'cycleNumber', 'SOH'])
train_data_scaled
# unscale target: overwrite the scaled 'SOH' column with the original values
train_data_scaled['SOH'] = train_data['SOH']
train_data_scaled
# split target and input
X = train_data_scaled.drop('SOH', axis=1)
y = train_data_scaled['SOH'].values
# model
# NOTE(review): epsilon=1 is expressed in the units of the unscaled target;
# presumably SOH lies within [0, 1], which would make the epsilon tube wider
# than the whole target range -- confirm against the data.
model = SVR(kernel='rbf', C=100, epsilon=1)
svr = model.fit(X, y)
# predict model: predictions are made on the same rows the model was fitted on
pred = model.predict(X)
Now returning `pred` gives the same prediction for each row:
array([0.89976814, 0.89976814, 0.89976814, ..., 0.89976814, 0.89976814,
0.89976814])
why is this happening?
Using StandardScaler() on the X and y data corrected this issue, with an inverse called to return it to original values.
import numpy as np
import pandas as pd
import matplotlib
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# Load the training data and pull out the 'Y' label column.
train = pd.read_csv("train_final.csv")
y = train['Y']
# Copy the labels one by one into a plain list, then into a NumPy array.
# NOTE(review): the loop body below lost its indentation in this paste.
YValues = []
for x in range(len(y)):
YValues.append(y[x])
print(YValues)
print(type(YValues))
YVal = np.array(YValues)
# Remove the label column so that only the features remain in `train`.
train = train.drop(['Y'], axis=1)
test = pd.read_csv("test_final.csv")
# DMatrix wrappers for the features; neither is used again in this snippet.
dtrain = xgb.DMatrix(train, label = y)
dtest = xgb.DMatrix(test)
# NOTE(review): `xgb2` is never assigned in this snippet; the classifier
# created here is bound to `xgb2_hyperparams`, is not fitted, and is then
# overwritten by the next line.
xgb2_hyperparams = XGBClassifier()
xgb2_hyperparams = xgb2.predict_proba(test)
xgb2_hyperparams_test = xgb2.predict_proba(train)
# NOTE(review): predict_proba presumably returns one column per class, which
# would make the second argument below 2-D -- confirm against the traceback.
print('Accuracy: ', roc_auc_score(YVal, xgb2_hyperparams_test))
np.savetxt("xgboostHyperParams.csv", xgb2_hyperparams, delimiter=",")
print(xgb2_hyperparams)
I've explicitly created YVal to be a 1D np-array but it is still saying that YVal is an array of shape (2603, 2) and I'm not sure what is up with that. I originally tried fiddling with y but that led to more errors and at this point, I'm not sure why Python is so adamant about the (2603, 2) shape - I'm not sure what I'm missing that it is always reading it as (2603, 2) no matter whether it is data type series, ndarray or array.
You have to convert it to an array. Pandas essentially creates a container around your data. Try:
import numpy as np
import pandas as pd
import matplotlib
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# Load the training data and take the 'Y' labels as a NumPy array (.values).
train = pd.read_csv("train_final.csv")
y = train['Y'].values
# Copy the labels into a list and back into an array.
# NOTE(review): the loop body below lost its indentation in this paste.
YValues = []
for x in range(len(y)):
YValues.append(y[x])
print(YValues)
print(type(YValues))
YVal = np.array(YValues)
# Remove the label column and convert the remaining features to an array.
train = train.drop(['Y'], axis=1).values
test = pd.read_csv("test_final.csv")
# DMatrix wrappers for the features; neither is used again in this snippet.
dtrain = xgb.DMatrix(train, label = y)
dtest = xgb.DMatrix(test)
# NOTE(review): `xgb2` is never assigned in this snippet; the classifier
# created here is bound to `xgb2_hyperparams`, is not fitted, and is then
# overwritten by the next line.
xgb2_hyperparams = XGBClassifier()
xgb2_hyperparams = xgb2.predict_proba(test)
xgb2_hyperparams_test = xgb2.predict_proba(train)
# NOTE(review): predict_proba presumably returns one column per class, so the
# second argument below may still be 2-D -- confirm.
print('Accuracy: ', roc_auc_score(YVal, xgb2_hyperparams_test))
np.savetxt("xgboostHyperParams.csv", xgb2_hyperparams, delimiter=",")
print(xgb2_hyperparams)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix
import pandas as pd
# Load the weather data and show it.
df=pd.read_csv('weather.csv',delimiter=',')
print(df)
# Features: every column except the last. Target: the last column.
x=df.values[:,0:df.shape[1]-1]
y=df.values[:,df.shape[1]-1]
# Split half/half with a fixed seed.
# NOTE(review): check the order of the names on the left-hand side against
# the order in which train_test_split returns its outputs.
x_train,y_train,x_test,y_test = train_test_split(x,y,test_size=0.5,random_state=0)
gnb=GaussianNB()
# Fit on the training split and predict for the test split in one chained call.
y_pred=gnb.fit(x_train,y_train).predict(x_test)
print(y_test,y_pred)
# Count the test rows whose prediction differs from the true label. The
# comparison is summed first, and that count fills the second %d placeholder.
print("Number of misplaced points out of a total %d points : %d" % (x_test.shape[0], (y_test != y_pred).sum()))
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
The above is my code which I tried in Google Colab. But here it shows one error :
"y should be a 1d array, got an array of shape {} instead.".format(shape)"
This error is shown in the line
y_pred=gnb.fit(x_train,y_train).predict(x_test)
Please help me to solve this error. I am a beginner so answer the question with elaboration
Your problem is that the outputs of train_test_split are ordered differently than you think.
train_test_split returns the split of the first argument first, then the split of the second argument. So instead you should use it like
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5,random_state=0)
You can find more information and a few examples in the documentation.
You can resolve issues like that by inspecting the shapes of the values of your variables. Either use a debugger or print their shapes:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
data = np.random.rand(100, 5) # some test data
df = pd.DataFrame(data)
x = df.values[:, :-1] # you probably don't want to include the last column here?
y = df.values[:, -1] # does the same as df.shape[1]-1
print(f"x shape: {x.shape}") # (100, 4)
print(f"y shape: {y.shape}") # (100,) ==> 1d, fine
# The unpacking order below is deliberately the wrong one from the question,
# so that the printed shapes reveal the mix-up.
x_train, y_train, x_test, y_test = train_test_split(x,y,test_size=0.5,random_state=0)
print(f"x_train shape: {x_train.shape}") # (50, 4)
print(f"y_train shape: {y_train.shape}") # (50, 4) ==> 2d, so something is wrong
print(f"x_test shape: {x_test.shape}") # (50,) => also bad
print(f"y_test shape: {y_test.shape}") # (50,) => 1d as expected
gnb=GaussianNB()
y_pred=gnb.fit(x_train,y_train).predict(x_test) # error y should be 1d ...
Now you can see why the error is raised and where things go wrong. Then you can look up the documentation of the last command that produced unexpected output.
How can I solve this problem?
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
import seaborn as sns
train = pd.read_csv(r"G:\data_science\input\train.csv")
cat_columns = ['area_type','availability','location','size','society','bath','balcony']
# Replace each listed column that is present in the frame with its integer
# category codes.
for col in train.columns:
    if col in cat_columns:
        train[col] = train[col].astype('category').cat.codes
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
# Load the test data from its own CSV file.
# NOTE(review): unlike `train`, `test` is not passed through the category-code
# loop anywhere in this snippet -- confirm that its text columns are encoded
# before predict() is called.
test = pd.read_csv(r"G:\data_science\input\test.csv")
# Split both frames into the 'price' target and the remaining feature columns.
y_train = train['price']
x_train = train.drop('price', axis = 1)
y_test = test['price']
x_test = test.drop('price',axis = 1)
# Fit ordinary least squares on the training split and predict the test split.
model = LinearRegression()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
prediction
All your values are read as string from the csv file.
Now, somewhere you are trying to convert some values into float.
But the value that it has encountered is '2100 - 2850'.
Now you cannot convert this value into float and that is what the error is saying.
Please check the dataset once and resolve any such garbage value.
The error occurs in the last two lines of my code.
ValueError: Expected 2D array, got 1D array instead: array=[0 1].
Reshape your data either using array.reshape(-1, 1) if your data has a
single feature or array.reshape(1, -1) if it contains a single sample.
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
%matplotlib inline
# Load the data and drop the 'Company' column in place.
df = pd.read_csv('.......csv')
df.drop(['Company'], 1, inplace=True)
# Features: every column except 'R&D Expense'. Target: 'R&D Expense' alone.
x = pd.DataFrame(df.drop(['R&D Expense'],1))
y = pd.DataFrame(df['R&D Expense'])
# NOTE(review): x.index[[0,1]] / y.index[[0,1]] select the first two index
# labels, not the rows themselves -- compare with the error text "array=[0 1]".
X_test = x.index[[0,1]]
y_test = y.index[[0,1]]
# Training data: everything except the rows carrying those two labels.
X_train = x.drop(x.index[[0,1]])
y_train = y.drop(y.index[[0,1]])
from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """Return the R^2 score of `y_predict` measured against `y_true`."""
    score = r2_score(y_true, y_predict)
    return score
from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
def fit_model_shuffle(x, y):
    """Grid-search `n_neighbors` for a KNeighborsRegressor and return the best model.

    Candidates n_neighbors = 3..9 are scored with `performance_metric` over
    10 shuffle splits that each hold out 20% of the data (seed 0).

    Args:
        x: training features.
        y: training targets.

    Returns:
        The `best_estimator_` of the fitted GridSearchCV.
    """
    cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 0)
    regressor = KNeighborsRegressor()
    params = {'n_neighbors':range(3,10)}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(regressor, param_grid=params,scoring=scoring_fnc,cv=cv_sets)
    grid = grid.fit(x, y)
    return grid.best_estimator_
# Fit the tuned regressor, then print each test prediction numbered from 1.
reg = fit_model_shuffle(X_train, y_train)
for i, y_predict in enumerate(reg.predict(X_test),1):
    print(i, y_predict)
The error message is self-explanatory. Your library expects the input to be a 2D matrix, with one pattern per row. So, if you are doing regression with just one input, before passing it to the regressor, do
my_data = my_data.reshape(-1, 1)
to make a 2x1 matrix (two samples with one feature each).
On the other hand (unlikely), if you have a single vector [0, 1]
my_data = my_data.reshape(1, -1)
to make a 1x2 matrix (one sample with two features).