Incompatible dimension for X and Y matrices - python

I was wondering what i have wrong here i get the error
Traceback (most recent call last):
File "main.py", line 37, in <module>
y_pred = knn.predict(X_test)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/neighbors/classification.py", line149, in predict
neigh_dist, neigh_ind = self.kneighbors(X)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/neighbors/base.py", line 434, in kneighbors
**kwds))
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1448, in pairwise_distances_chunked
n_jobs=n_jobs, **kwds)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1588, in pairwise_distances
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 1206, in _parallel_pairwise
return func(X, Y, **kwds)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 232, ineuclidean_distances
X, Y = check_pairwise_arrays(X, Y)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/metrics/pairwise.py", line 125, incheck_pairwise_arrays
X.shape[1], Y.shape[1]))
ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 38 while Y.shape[1] == 43
I'm new to ai and cant find anything on the internet that really solves this problem, any comment appreciated. This is my code
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
fileName = "breast-cancer-fixed.csv";
df = pd.read_csv(fileName)
X = df[df.columns[:-1]]
y = df[df.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
X_train = OneHotEncoder().fit_transform(X_train)
X_test = OneHotEncoder().fit_transform(X_test)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("kNN model accuracy:", metrics.accuracy_score(y_test, y_pred))
My csv is massive and i cant upload it here so i put a small snippet in
age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
50-59,premeno,25-29,3-5,no,2,right,left_up,yes,no-recurrence-events
50-59,ge40,40-44,0-2,no,3,left,left_up,no,no-recurrence-events
40-49,premeno,10-14,0-2,no,2,left,left_up,no,no-recurrence-events
40-49,premeno,0-4,0-2,no,2,right,right_low,no,no-recurrence-events
40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,no-recurrence-events
50-59,premeno,25-29,0-2,no,2,left,left_low,no,no-recurrence-events
60-69,ge40,15-19,0-2,no,2,right,left_up,no,no-recurrence-events
Also if i get rid of the last two line of code ( the prediction code ) it runs fine with no errors

trying adding this line anywhere above the transforms
enc = OneHotEncoder(handle_unknown='ignore')
then change the transform lines to the following
enc = enc.fit(X_train)
X_train = enc.transform(X_train)
X_test = enc.transform(X_test)

I get this error
```Traceback (most recent call last):
File "main.py", line 25, in <module>
X_test = OneHotEncoder().transform(X_test)
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py", line 726, in transform
check_is_fitted(self, 'categories_')
File "/home/runner/.local/share/virtualenvs/python3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 914, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.```

Related

How can I use a dataframe of multi-value in each cell as an input to machine learning for classification

I build a data frame with multivalued in each cell as picture below
and I want to use logistic regression for classification>>>>
my code is :
fds1 = pd.DataFrame(featuresdata)
fds1.fillna('', inplace=True)
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(fds1, y, test_size=0.30, random_state=100)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy for logistic regression:", score)
but there was an error with this code:
File "C:\Users\hp\PycharmProjects\pythonProject\FE2.py", line 317, in CLS2butclick
classifier.fit(X_train, y_train)
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1138, in fit
X, y = self._validate_data(
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\base.py", line 596, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y
X = check_array(
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
array = np.asarray(array, order=order, dtype=dtype)
File "C:\Users\hp\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\core\generic.py", line 2064, in __array__
return np.asarray(self._values, dtype=dtype)
ValueError: setting an array element with a sequence.
How to fix that?
You need to do a label encoding before the training and convert string values to make them understandable for machine.
Refer to https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

Error `` `MultiLabelBinarizer``` when importing strings from a csv to a fit () function to train a model with scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('coords.csv',sep=';') #Cargo el archivo csv
x = df.iloc[1:,1:] #features values
y = df.iloc[1:,0] #target value
y = y.apply(lambda y: y.encode())
print(x)
print(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1234)
print(x_train)
print(y_train)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
pipelines = {
'lr':make_pipeline(StandardScaler(), LogisticRegression()),
'rc':make_pipeline(StandardScaler(), RidgeClassifier()),
'rf':make_pipeline(StandardScaler(), RandomForestClassifier()),
'gb':make_pipeline(StandardScaler(), GradientBoostingClassifier()),
}
fit_models = {}
for algo, pipeline in pipelines.items():
model = pipeline.fit(x_train, y_train)
fit_models[algo] = model
print(fit_models)
print(fit_models['lr'].predict(x_test))
print(fit_models['rc'].predict(x_test))
print(fit_models['rf'].predict(x_test))
print(fit_models['gb'].predict(x_test))
I was having a problem when trying to load strings from a csv file, because it tells me:
Traceback (most recent call last):
File "3_Train_Custom_Model_Using_Scikit_Learn.py", line 99, in <module>
model = pipeline.fit(x_train, y_train)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\utils\optimize.py", line 243, in _check_optimize_result
).format(solver, result.status, result.message.decode("latin1"))
AttributeError: 'str' object has no attribute 'decode'
And when I add y = y.apply (lambda y: y.encode ()) because I thought I needed to transform strings to bytes, I get this:
Traceback (most recent call last):
File "3_Train_Custom_Model_Using_Scikit_Learn.py", line 99, in <module>
model = pipeline.fit(x_train, y_train)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 335, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1345, in fit
check_classification_targets(y)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 169, in check_classification_targets
y_type = type_of_target(y)
File "C:\Users\PC0\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 263, in type_of_target
raise ValueError('You appear to be using a legacy multi-label data'
ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.
How do I so that the data framed in red from the csv that you see in the following Excel screenshot, which would be the targets, are saved in the variable y, and those that are framed in blue that It would be the features (x1, y1, z1, v1, x2, y2, z2, v2, ..., x501, y501, z501, v501) that must be saved in the variable x.
Try this:
df = pd.read_csv('testing.csv',sep=';',header=1)
x = df.iloc[:,1:] #features values
y = df.iloc[:,0] #target value
#y = y.apply(lambda y: y.encode())
print(x)
print(y)
...

python pipeline does not execute imputer

I analysing the gapminder dataset [1] using a pipeline in Python but for some reason the imputer does not replace the nan values. According to the documentation ("For missing values encoded as np.nan, use the string value “NaN”.") I should do it like below but the code crashes with "ValueError: Input contains NaN" in the line "gm_cv.fit(X_train, y_train)". But gm_cv was created based on the pipeline and the pipeline contains the imputation which should remove the nans. Why does this not work?
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
fn = 'gapminder.csv'
df = pd.read_csv(fn, delimiter=',')
# replace empty strings with numpy nans
df.replace('', np.nan, inplace=True)
df.replace(' ', np.nan, inplace=True)
targetVariable = 'lifeexpectancy'
X = df.drop([targetVariable, 'country'], axis=1).values
y = df[targetVariable]
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
('scaler', StandardScaler()),
('elasticnet', ElasticNet())]
pipeline = Pipeline(steps)
# Specify the hyperparameter space
parameters = {'elasticnet__l1_ratio': np.linspace(0,1,30)}
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4, random_state=98)
# Create the GridSearchCrossValidation object
gm_cv = GridSearchCV(pipeline, parameters, cv=3)
# Fit to the training set
gm_cv.fit(X_train, y_train)
# results:
r2 = gm_cv.score(X_test, y_test)
print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
stack trace of full error:
python.exe pipline_and_classification_II.py
Traceback (most recent call last):
File "pipline_and_classification_II.py", line 55, in <module>
gm_cv.fit(X_train, y_train)
File "lib\site-packages\sklearn\model_selection\_search.py", line 639, in fit
cv.split(X, y, groups)))
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "lib\site-packages\sklearn\model_selection\_validation.py", line 458, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "lib\site-packages\sklearn\pipeline.py", line 250, in fit
self._final_estimator.fit(Xt, y, **fit_params)
File "lib\site-packages\sklearn\linear_model\coordinate_descent.py", line 709, in fit
ensure_2d=False)
File "lib\site-packages\sklearn\utils\validation.py", line 453, in check_array
_assert_all_finite(array)
File "lib\site-packages\sklearn\utils\validation.py", line 44, in _assert_all_finite
" or a value too large for %r." % X.dtype)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Process finished with exit code 1
Update:
Debugging it slowly (without the pipeline) shows that the Imputer does not like 1d arrays (like y in my code above). When doing the nan-removing manually before with the code below it works.
y = np.array(y)
idx = np.argwhere(np.isnan(y))
y[idx] = np.nanmean(y)
But this defeats the purpose of the pipeline. Any ideas how to get this running without manual tinkering?
[1] http://makemeanalyst.com/download-and-learn-about-gapminder-dataset/

Editing entire row in .csv using Pandas and feeding to KNeighborsClassifier

I am new to applied machine learning and there is this dataset which includes a column of percentage of cocoa present in a chocolate. But when I feed that column to fit() function of the KNeighborsClassifer it throws the following error;
Traceback (most recent call last):
File "/home/himanshu/ML Tut-2/ML_tut2.py", line 13, in <module>
knn.fit(X_train, y_train)
File "/usr/local/lib/python2.7/dist-packages/sklearn/neighbors/base.py", line 765, in fit
X, y = check_X_y(X, y, "csr", multi_output=True)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 542, in check_X_y
ensure_min_features, warn_on_dtype, estimator)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 402, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
ValueError: invalid literal for float(): 72%
My code is this;
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
choco = pd.read_csv('flavors_of_cacao.csv')
X = choco['Cocoa']
y = choco['Name']
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=0)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
It is clear that the fit() function needs a float number in the cocoa column but it is getting '%' symbol along with the number which cannot be converted to float without manipulations.
Please help me to fix this problem.
EDIT:
I have edited my CSV and removed the '%' signs from it, but now I am getting the following error;
Traceback (most recent call last):
File "/home/himanshu/ML Tut-2/ML_tut2.py", line 14, in <module>
knn.fit(X_train, y_train)
File "/usr/local/lib/python2.7/dist-packages/sklearn/neighbors/base.py", line 765, in fit
X, y = check_X_y(X, y, "csr", multi_output=True)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 552, in check_X_y
check_consistent_length(X, y)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py", line 173, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [1346, 449]
The new code is;
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
df = pd.read_csv('chocos.csv')
X = df[['Cocoa']]
y = df['Name']
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=0)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
For reference, the dataset is this.
Just use the values from that column without the percent symbol:
X = [[float(val.replace('%',''))] for val in choco['Cocoa']]

MLP classification fitting

I'm new to Machine Learning and I'm working on a python application that classifies poker hands using a dataset which I will post snippets. It does not seem to work well. And I am getting the following error:
Traceback (most recent call last):
File "C:\Users\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-62-0d21cd839ce4>", line 1, in <module>
mlp.fit(X_test, y_train.values.reshape(len(y_train), 1))
File "C:\Users\Anaconda3\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py", line 618, in fit
return self._fit(X, y, incremental=False)
File "C:\Users\Anaconda3\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py", line 330, in _fit
X, y = self._validate_input(X, y, incremental)
File "C:\Users\Anaconda3\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py", line 902, in _validate_input
multi_output=True)
File "C:\Users\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 531, in check_X_y
check_consistent_length(X, y)
File "C:\Users\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 181, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [6253, 18757]
here is the code I am trying to produce:
import pandas as pnd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
training_data = pnd.read_csv("train.csv")
training_data['id'] = range(1, len(training_data) + 1) # For 1-base index
training_datafile = training_data
target = training_datafile['hand']
data = training_datafile.drop(['id', 'hand'], axis=1)
X = data
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape
y_train.shape
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100))
mlp.fit(X_test, y_train.values.reshape(len(y_train), 1))
predictions = mlp.predict(X_test)
len(mlp.coefs_)
len(mlp.coefs_[0])
len(mlp.intercepts_[0])
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
The shape of X_train.shape is (18757, 10) and the shape of y_train.shape is (18757,)
I have tried using following previous post
y_train.values.reshape(len(y_train), 1)
But I still get the same error. Some guidance would be of much help since I am not sure of what the shape has wrong.
Data snippet:
You are fiting X_test instead of X_train.
mlp.fit(X_train, y_train.values.reshape(len(y_train), 1))

Categories

Resources