I'm trying to use GridSearchCV to optimize the parameters for the classifier svm.SVC (both from sklearn).
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import numpy as np
X_train = np.array([[1,2],[3,4],[5,6],[2,3],[9,4],[4,5],[2,7],[1,0],[4,7],[2,9]])
Y_train = np.array([0,1,0,1,0,0,1,1,0,1])
X_test = np.array([[2,4],[5,3],[7,1],[2,4],[6,4],[2,7],[9,2],[7,5],[1,6],[0,3]])
Y_test = np.array([1,0,0,0,1,0,1,1,0,0])
parameters = {'kernel':['rbf'],'C':np.linspace(10,100,10)}
clf1 = GridSearchCV(SVC(), parameters, verbose = 10)
clf1.fit(X_train, Y_train)
cm = confusion_matrix(Y_test, clf1.predict(X_test))
bp = clf1.best_params_
The output shows GridSearchCV completing, but then it throws this error:
Traceback (most recent call last):
File "<ipython console>", line 1, in <module>
File "C:\Python27\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 479, in runfile
execfile(filename, namespace)
File "I:\setup\Desktop\Stats\FinalProject.py", line 112, in <module>
clf1 = GridSearchCV(SVC(), parameters, verbose = 10)
TypeError: 'dict' object is not callable
When I run the code you posted:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import numpy as np
X_train = np.array([[1,2],[3,4],[5,6]])
Y_train = np.array([0,1,0])
X_test = np.array([[2,4],[5,3],[7,1]])
Y_test = np.array([1,0,0])
parameters = {'kernel':['rbf'],'C':np.linspace(10,100,10)}
clf1 = GridSearchCV(SVC(), parameters, verbose = 10)
clf1.fit(X_train, Y_train)
cm = confusion_matrix(Y_test, clf1.predict(X_test))
bp = clf1.best_params_
I'm getting this error:
File "C:\Anaconda\lib\site-packages\sklearn\svm\base.py", line 447, in _validate_targets
% len(cls))
ValueError: The number of classes has to be greater than one; got 1
Since the training data consist of only 3 samples, GridSearchCV breaks them into 3 folds (you can control this with the cv parameter), e.g.:
fold1 = [1,2], label1 = 0
fold2 = [3,4], label2 = 1
fold3 = [5,6], label3 = 0
Now, in some iteration it trains on the first and third folds and uses the second fold for validation. Note that this training set then contains only one label (label 0), hence the error it prints.
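If you want each training split to see both labels even on tiny data, you can pass a stratified splitter through the cv parameter. A minimal sketch, assuming the old sklearn.grid_search / sklearn.cross_validation API that matches the question's imports (in newer versions both live in sklearn.model_selection and StratifiedKFold takes n_splits instead of y and n_folds):

from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import numpy as np

X_train = np.array([[1, 2], [3, 4], [5, 6], [2, 3]])
Y_train = np.array([0, 1, 0, 1])

# Stratification keeps the class ratio in every fold, so each training
# split contains both labels; 2 folds is the most this tiny sample allows.
cv = StratifiedKFold(Y_train, n_folds=2)
parameters = {'kernel': ['rbf'], 'C': np.linspace(10, 100, 10)}
clf = GridSearchCV(SVC(), parameters, cv=cv)
clf.fit(X_train, Y_train)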
If I create the data in this manner:
from sklearn import cross_validation, datasets

X, Y = datasets.make_classification(n_samples=1000, n_features=4,
                                    n_informative=2, n_redundant=2, n_classes=2)
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2)
It runs just fine.
I guess you have some other problem as well, but regarding the code you posted, this is the error it produces.
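One common cause of that TypeError, although I can't confirm it from your post: in an interactive session (IPython/Spyder), the name GridSearchCV can get rebound to a dict, so the apparent constructor call is really calling a dict. A hypothetical reproduction:

GridSearchCV = {'kernel': ['rbf']}                  # accidental rebinding earlier in the session
clf1 = GridSearchCV(SVC(), parameters, verbose=10)  # TypeError: 'dict' object is not callable

Restarting the interpreter (or re-running the import) clears the shadowed name.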
I am trying to run the following code:
# Data Pre-processing Step
# importing libraries
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
# importing datasets
data_set = pd.read_csv('/Users/apple/Desktop/parkinsons.data')
# Extracting Independent and dependent Variable
x = data_set.iloc[:, [2, 3]].values
y = data_set.iloc[:, 4].values
# Splitting the dataset into training and test set.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
# feature Scaling
from sklearn.preprocessing import StandardScaler
st_x = StandardScaler()
x_train = st_x.fit_transform(x_train)
x_test = st_x.transform(x_test)
print(x_test)
from sklearn.svm import SVC # "Support vector classifier"
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(x_train, y_train)
It fails on the line classifier.fit(x_train, y_train) as follows:
Traceback (most recent call last):
File "/Users/apple/PycharmProjects/pythonProject4/main.py", line 30, in <module>
classifier.fit(x_train, y_train)
File "/Users/apple/PycharmProjects/pythonProject4/venv/lib/python3.10/site- packages/sklearn/svm/_base.py", line 201, in fit
y = self._validate_targets(y)
File "/Users/apple/PycharmProjects/pythonProject4/venv/lib/python3.10/site- packages/sklearn/svm/_base.py", line 745, in _validate_targets
check_classification_targets(y)
File "/Users/apple/PycharmProjects/pythonProject4/venv/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 207, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'
Process finished with exit code 1
What is wrong with my code? Is it due to the version?
I am using PyCharm with Python 3.10.
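The error itself points at the cause, and it is not a version issue: check_classification_targets found continuous (float) values in y, while a classifier like SVC needs discrete class labels. A hedged sketch of how to check, assuming this is the UCI Parkinson's dataset (where the 0/1 target column is named 'status' and column 4 is a continuous voice measurement):

import pandas as pd

data_set = pd.read_csv('/Users/apple/Desktop/parkinsons.data')
print(data_set.dtypes)       # shows which columns hold floats vs. integer labels
print(data_set.columns[4])   # the column currently being used as y

# Assumption: in the UCI Parkinson's data the classification target is the
# 0/1 'status' column, so select it by name instead of by position.
y = data_set['status'].values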
from sklearn import datasets
import numpy as np
# Assigning the petal length and petal width of the 150 flower samples to Matrix X
# Class labels of the flower to vector y
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target
print('Class labels:', np.unique(y))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
print('Labels counts in y:', np.bincount(y))
print('Labels counts in y_train:', np.bincount(y_train))
print ('Labels counts in y_test:', np.bincount(y_test))
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
from sklearn.linear_model import Perceptron
ppn = Perceptron(n_iter=40, eta0=0.1, random_state=1)
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
When I run it I get this error message:
Traceback (most recent call last):
File "c:/Users/Desfios 5/Desktop/Python/Ch3.py", line 27, in <module>
ppn = Perceptron(n_iter=40, eta0=0.1, random_state=1)
File "C:\Users\Desfios 5\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\validation.py", line 72, in inner_f
return f(**kwargs)
TypeError: __init__() got an unexpected keyword argument 'n_iter'
I've tried uninstalling and reinstalling scikit-learn but that did not help. Any ideas?
I just changed n_iter to max_iter and it worked for me:
ppn = Perceptron(max_iter=40, eta0=0.3, random_state=0)
You receive this error
TypeError: __init__() got an unexpected keyword argument 'n_iter'
because Perceptron has no constructor parameter called n_iter.
You may be thinking of the n_iter_ attribute, which is an "estimated attribute" (you can tell by the trailing underscore) and is only stored after the fit method has been called. Reference in Documentation
Before fitting, the related parameters you can set are max_iter and n_iter_no_change.
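A minimal sketch of the distinction, reusing the iris setup from the question:

from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron

X, y = load_iris(return_X_y=True)

ppn = Perceptron(max_iter=40, eta0=0.1, random_state=1)  # max_iter is the constructor parameter
ppn.fit(X, y)
print(ppn.n_iter_)  # n_iter_ is a fitted attribute, available only after fit()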
I want to predict samples that can belong to more than one label at a time (multi-label classification). So I use the scikit-multilearn library and have successfully fitted a classifier; I can even predict test data. It only fails at outputting the accuracy of the classifier.
My data has up to 1100 rows.
The dependent vars (the vars I'm predicting) are the last 4: N/xN, Sex, Maturity, and CType. The rest are the independent vars.
The accuracy I'm talking about is how close the classifier is to predicting all the labels.
Here's the code:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance
# Prepare data
df = pd.read_csv("Data_Numeric.csv")
# remove crab_id for now
del df['Crab_id']
# independent vars: the rest
# dependent vars: N/xN, Gender, Maturity, CType
# n_samples = 1100
# n_features = 6
# n_labels = 4
X = df.iloc[:, :6].values
y = df.iloc[:, 6:df.shape[1]].astype(np.int64).values
X = sparse.csr_matrix(X)
y = sparse.csr_matrix(y, dtype=np.int64)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# generate model
classifier = BinaryRelevance(SVC())
# train
classifier.fit(X_train, y_train)
# predict
y_pred = classifier.predict(X_test)
y_pred_array = y_pred.toarray()
# my_data = X_test[0:4, :]
# my_data[0] = [64.7, 46, 12, 13, 0, 0]
# my_data_prediction = classifier.predict(my_data).toarray()
# my_data_true = y_test[0:4, :].toarray()
# error here
score = accuracy_score(y_test.toarray(), y_pred.toarray())
The error is
Traceback (most recent call last):
File "<input>", line 42, in <module>
File "/home/f4ww4z/anaconda3/envs/ayah/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 185, in accuracy_score
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/home/f4ww4z/anaconda3/envs/ayah/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 97, in _check_targets
raise ValueError("{0} is not supported".format(y_type))
ValueError: multiclass-multioutput is not supported
y_test
>>> y_test
<330x4 sparse matrix of type '<class 'numpy.longlong'>'
with 578 stored elements in Compressed Sparse Row format>
y_test.toarray() has shape (330, 4).
y_pred
>>> y_pred
<330x4 sparse matrix of type '<class 'numpy.longlong'>'
with 408 stored elements in Compressed Sparse Column format>
y_pred.toarray() has the same shape, (330, 4).
How do I correctly see the accuracy of the classifier?
from sklearn.model_selection import cross_validate, KFold
clf = BinaryRelevance(SVC())
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(clf, X_train, y_train, cv=k_fold, scoring=['accuracy'])
OR
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train, y_train, cv=5)
By using cross-validation you obtain the 5 accuracy scores and can then take their mean.
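For example, cross_validate returns a dict keyed by 'test_<metric>', so with the scores from the snippet above:

print(scores['test_accuracy'].mean())  # mean of the 5 fold accuracies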
You can also keep it basic by using a MultiOutputClassifier and RandomForestClassifier:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, KFold
from sklearn.multioutput import MultiOutputClassifier
clf = MultiOutputClassifier(RandomForestClassifier(random_state=42, class_weight="balanced"))
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(clf, X_train, y_train, cv=k_fold, scoring=['f1_weighted'])
Maybe this will help you :)
I am running a Gaussian process regression in Python. My data set has shape (10000, 5). But when I try to fit the model I get an error:
AttributeError: 'list' object has no attribute 'n_dims'
How do I resolve this?
I initially thought this error was caused by the dimension of my dependent variable differing from that of the independent variables. But even after changing them to the same dimension, I am unable to find the problem with the code. Any help will be much appreciated.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic,
                                              ExpSineSquared, DotProduct,
                                              ConstantKernel)
data_set = pd.read_excel(r'XXXXX', sheet_name='Worksheet', header=0)
data_set.head()
test_set = data_set
y = test_set.iloc[:,4]
test_set.drop(test_set.columns[4], axis = 1, inplace = True)
X = test_set
x=StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
y_train = np.array(y_train)
y_test = np.array(y_test)
y_train = np.reshape(y_train, (7000,1))
y_test = np.reshape(y_test, (3000,1))
kernels = [1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0))]
gp = GaussianProcessRegressor(kernel=kernels)
gp.fit(X_train, y_train)
File "<ipython-input-23-5a576449fdb6>", line 1, in <module>
gp.fit(X_train, y_train)
File "C:\Program Files\Anaconda\lib\site-packages\sklearn\gaussian_process\gpr.py", line 203, in fit
if self.optimizer is not None and self.kernel_.n_dims > 0:
AttributeError: 'list' object has no attribute 'n_dims'
When initializing GaussianProcessRegressor(kernel=kernels), the argument passed as kernel has to be a single kernel object. You are passing a list.
More information in the documentation here.
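A minimal sketch of the fix, reusing X_train and y_train from the question: pass the kernel itself (or kernels[0]) rather than the list.

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# A single Kernel object, not a one-element list
kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0))
gp = GaussianProcessRegressor(kernel=kernel)
gp.fit(X_train, y_train)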
This is my first time posting here. For the past couple of days I have been trying to teach myself scikit-learn. But recently I have encountered an error that has been nagging me for quite some time.
My goal is simply to train an NB classifier clf so that I can feed it an arbitrary list of strings called new_doc and it will predict which class each string is likely to belong to.
This is what my program looks like:
#Importing stuff
import numpy as np
import pylab
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn import metrics
#Opening the csv file
df = pd.read_csv('data.csv', sep=',')
#Randomising the rows in the file
df = df.reindex(np.random.permutation(df.index))
#Extracting features from text, define target y and data X
vect = CountVectorizer()
X = vect.fit_transform(df['Features'])
y = df['Target']
#Partitioning the data into test and training set
SPLIT_PERC = 0.75
split_size = int(len(y)*SPLIT_PERC)
X_train = X[:split_size]
X_test = X[split_size:]
y_train = y[:split_size]
y_test = y[split_size:]
#Training the model
clf = MultinomialNB()
clf.fit(X_train, y_train)
#Evaluating the results
print "Accuracy on training set:"
print clf.score(X_train, y_train)
print "Accuracy on testing set:"
print clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
print "Classification Report:"
print metrics.classification_report(y_test, y_pred)
#Predicting new data
new_doc = ["MacDonalds", "Walmart", "Target", "Starbucks"]
trans_doc = vect.transform(new_doc) #extracting features
y_pred = clf.predict(trans_doc) #predicting
But when I run the program I get the following error on the last row:
y_pred = clf.predict(trans_doc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/sklearn/naive_bayes.py", line 62, in predict
jll = self._joint_log_likelihood(X)
File "/Library/Python/2.7/site-packages/sklearn/naive_bayes.py", line 441, in _joint_log_likelihood
return (safe_sparse_dot(X, self.feature_log_prob_.T)
File "/Library/Python/2.7/site-packages/sklearn/utils/extmath.py", line 175, in safe_sparse_dot
ret = a * b
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/sparse/base.py", line 334, in __mul__
raise ValueError('dimension mismatch')
ValueError: dimension mismatch
So apparently it has something to do with the dimensions of the term-document matrices.
When I check the dimensions of trans_doc, X_train and X_test I get:
>>> trans_doc.shape
(4, 4)
>>> X_train.shape
(145314, 28750)
>>> X_test.shape
(48439, 28750)
In order for y_pred = clf.predict(trans_doc) to work I need to (from what I understand) transform new_doc into a term-document matrix with the dimensions (4, 28750). But I don't know of any method within CountVectorizer that lets me do this.
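A (4, 4) shape would only come from a vectorizer fitted on the four new strings themselves, e.g. by calling fit_transform on new_doc somewhere in the session; I can't confirm that from the post, but it is the usual cause. Calling transform on the vect that was fitted on the training corpus maps the new strings onto the existing 28750-word vocabulary (unseen words are simply dropped), which produces exactly the shape the classifier expects:

trans_doc = vect.transform(new_doc)  # reuse the fitted vectorizer; do NOT call fit_transform here
print trans_doc.shape                # (4, 28750)
y_pred = clf.predict(trans_doc)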