Predicting with k-mers and MultinomialNB (scikit-learn) - Python
I'm trying to build a classifier with scikit-learn in Python to predict whether a viral nucleotide sequence is potentially pathogenic for humans. I labeled pathogenic sequences with 0 and non-pathogenic ones with 1; each sequence is separated from its class label by a tab, like this:
ATCGATCGAATCGGATC 1
ATCGGGGGATATATAAATTACATATATTGTTGTATG 1
ATCGTAT 0
ATAAATATTGTATTGCG 0
...
My work is essentially based on Krish Naik's notebook https://github.com/krishnaik06/DNA-Sequencing-Classifier/blob/master/DNA%20Sequencing%20and%20applying%20Classifier.ipynb, where he predicts protein classes. I simply modified it to fit my goal, but the problem is that I can't find any way to predict the pathogenicity of a new sequence.
You can find the data I used on my GitLab: https://gitlab.com/MasterBioinformaticBiostatistics/bioinformatic/virusid.git
Here is the code from Krish Naik that I used with my data (this part seems to work, since a model gets built):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Code modified from Krish Naik https://github.com/krishnaik06/DNA-Sequencing-Classifier/blob/master/DNA%20Sequencing%20and%20applying%20Classifier.ipynb
human_data = pd.read_table('final_petit.txt')
human_data.head()
from IPython.display import Image
Image("Capture1.PNG")
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
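# For example (a quick sanity check, not part of the original notebook):
#   getKmers("ATCGATCG", size=6)  ->  ['atcgat', 'tcgatc', 'cgatcg']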
human_data['words'] = human_data.apply(lambda x: getKmers(x['sequence']), axis=1)
human_data = human_data.drop('sequence', axis=1)
human_data.head()
human_texts = list(human_data['words'])
for item in range(len(human_texts)):
    human_texts[item] = ' '.join(human_texts[item])
y_data = human_data.iloc[:, 0].values
print(human_texts[2])
y_data
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
X = cv.fit_transform(human_texts)
print(X.shape)
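# Each "document" is now a space-separated string of overlapping hexamers, so
# ngram_range=(4,4) makes every feature a run of 4 consecutive 6-mers
# (spanning 9 nucleotides of the underlying sequence). X is a sparse
# document-term matrix: one row per sequence, one column per distinct 4-gram.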
human_data['class'].value_counts().sort_index().plot.bar()
# Splitting the human dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_data, test_size=0.20, random_state=42)
print(X_train.shape)
print(X_test.shape)
### Multinomial Naive Bayes Classifier ###
# The alpha parameter was determined by grid search previously
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.1)
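# (A sketch of how that earlier grid search might have looked; not part of
# the original notebook:)
# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(MultinomialNB(), {'alpha': [0.01, 0.1, 0.5, 1.0]}, cv=5)
# grid.fit(X_train, y_train)
# print(grid.best_params_)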
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_pred, name='Predicted')))
def get_metrics(y_test, y_predicted):
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
In order to predict on a new sequence, I decided to follow the same method: get and count the k-mer words of my new sequence:
#NEW SEQUENCE
new_seq = pd.read_table('sequence.txt')
new_seq.head()
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
new_seq = new_seq.drop('sequence', axis=1)
new_seq.head()
new_seqtext = list(new_seq['words'])
for item in range(len(new_seqtext)):
    new_seqtext[item] = ' '.join(new_seqtext[item])
y_data = new_seq.iloc[:, 0].values
y_data
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
U = cv.fit_transform(new_seqtext)
u_pred = classifier.predict(U)
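# (Note: fitting a new CountVectorizer here builds a new vocabulary, so U
# will generally not have the same columns the classifier was trained on;
# see the fix at the end of this post.)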
But the prediction using classifier.predict does not work as intended, even though the sequence has been cut and counted into k-mers:
Traceback (most recent call last):
File "Original.py", line 89, in <module>
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/frame.py", line 6878, in apply
return op.get_result()
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/apply.py", line 186, in get_result
return self.apply_standard()
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/apply.py", line 295, in apply_standard
result = libreduction.compute_reduction(
File "pandas/_libs/reduction.pyx", line 618, in pandas._libs.reduction.compute_reduction
File "pandas/_libs/reduction.pyx", line 128, in pandas._libs.reduction.Reducer.get_result
File "Original.py", line 89, in <lambda>
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 4419, in get_value
raise e1
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas/_libs/index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas/_libs/index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'sequence'
The program seems to crash during this step:
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
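(The KeyError: 'sequence' itself simply means the DataFrame read from sequence.txt has no column named sequence, most likely because that file lacks a header line. Assuming a headerless one-column file, naming the column explicitly when reading would avoid this:)
new_seq = pd.read_table('sequence.txt', header=None, names=['sequence'])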
Is my approach too naive for getting the k-mers of a new sequence?
A teammate found a solution:
Instead of using
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
U = cv.fit_transform(new_seqtext)
You may use this:
U = cv.transform(new_seqtext)
As I understand it so far, fit_transform is used to build the model (the vocabulary), so there is no need to import CountVectorizer again or to redeclare cv: it has already been fitted on the training data and can simply be reused.
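For completeness, a minimal sketch of the corrected prediction step, assuming cv and classifier are still in scope from the training code above:
# Reuse the vectorizer fitted on the training data so the new sequences are
# mapped into the same feature space (same vocabulary, same columns).
U = cv.transform(new_seqtext)
u_pred = classifier.predict(U)
print(u_pred)  # one class label (0 or 1) per new sequence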