predict khmer with multinomialNB (scikit-learn) - python

I'm trying to make a classifier with scikit-learn on python, to predict if a nucleotide sequence of virus is potentially pathogenic for human. I identified some sequence as pathogenic with 0 and not with 1, sequence are differentiated from sequence to class with tabulation like this:
ATCGATCGAATCGGATC 1
ATCGGGGGATATATAAATTACATATATTGTTGTATG 1
ATCGTAT 0
ATAAATATTGTATTGCG 0
...
I essentially got my work from Krish Naik https://github.com/krishnaik06/DNA-Sequencing-Classifier/blob/master/DNA%20Sequencing%20and%20applying%20Classifier.ipynb where he tried to predict protein class. I simply modified to fit with my goal, but the problem is I don't find any solutions to predict the pathogenicity of a new sequence.
You can find the data I used on my gitlab https://gitlab.com/MasterBioinformaticBiostatistics/bioinformatic/virusid.git .
Here is, the code of Krish Naik I used with my data (this part seems to work as a model has been built):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Code modified from Krish Naik https://github.com/krishnaik06/DNA-Sequencing-Classifier/blob/master/DNA%20Sequencing%20and%20applying%20Classifier.ipynb
human_data = pd.read_table('final_petit.txt')
human_data.head()
from IPython.display import Image
Image("Capture1.PNG")
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
human_data['words'] = human_data.apply(lambda x: getKmers(x['sequence']), axis=1)
human_data = human_data.drop('sequence', axis=1)
human_data.head()
human_texts = list(human_data['words'])
for item in range(len(human_texts)):
human_texts[item] = ' '.join(human_texts[item])
y_data = human_data.iloc[:, 0].values
print(human_texts[2])
y_data
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
X = cv.fit_transform(human_texts)
print(X.shape)
human_data['class'].value_counts().sort_index().plot.bar()
# Splitting the human dataset into the training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
y_data,
test_size = 0.20,
random_state=42)
print(X_train.shape)
print(X_test.shape)
### Multinomial Naive Bayes Classifier ###
# The alpha parameter was determined by grid search previously
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_pred, name='Predicted')))
def get_metrics(y_test, y_predicted):
accuracy = accuracy_score(y_test, y_predicted)
precision = precision_score(y_test, y_predicted, average='weighted')
recall = recall_score(y_test, y_predicted, average='weighted')
f1 = f1_score(y_test, y_predicted, average='weighted')
return accuracy, precision, recall, f1
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
In order to predict the new sequence I decided to follow the same method, get and count the khmer words of my new sequence:
#NEW SEQUENCE
new_seq = pd.read_table('sequence.txt')
new_seq.head()
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
new_seq = new_seq.drop('sequence', axis=1)
new_seq.head()
new_seqtext = list(new_seq['words'])
for item in range(len(new_seqtext)):
new_seqtext[item] = ' '.join(new_seqtext[item])
y_data = new_seq.iloc[:, 0].values
y_data
# Creating the Bag of Words model using CountVectorizer()
# This is equivalent to k-mer counting
# The n-gram size of 4 was previously determined by testing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
U = cv.fit_transform(new_seqtext)
u_pred = classifier.predict(U)
But the prediction using classifier.predict seems to not work as wanted, even if the sequence has been cut an count in khmer.
Traceback (most recent call last):
File "Original.py", line 89, in <module>
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/frame.py", line 6878, in apply
return op.get_result()
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/apply.py", line 186, in get_result
return self.apply_standard()
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/apply.py", line 295, in apply_standard
result = libreduction.compute_reduction(
File "pandas/_libs/reduction.pyx", line 618, in pandas._libs.reduction.compute_reduction
File "pandas/_libs/reduction.pyx", line 128, in pandas._libs.reduction.Reducer.get_result
File "Original.py", line 89, in <lambda>
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 4419, in get_value
raise e1
File "/home/name/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas/_libs/index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas/_libs/index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1618, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1626, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'sequence'
The program seems to crashed during this step:
new_seq['words'] = new_seq.apply(lambda x: getKmers(x['sequence']), axis=1)
Is my approach too naive to get khmer of a new sequence?

Solution by a teammate has been found :
Instead of using
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
U = cv.fit_transform(new_seqtext)
You may use this:
U = cv.transform(new_seqtext)
fit_transform is used for building the model as I understood so far, and the no need to import again and cv has always been declared...

Related

I want to use GridSearchCV to find best parameters and predict two methods

My code is below:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_moons
n = 300
n_tr = 200
X, y = make_moons(n, shuffle=True, noise=0.2, random_state=112)
y[y==0] = -1
# standardise the data
trmean = np.mean(X[:n_tr, :], axis=0)
trvar = np.var(X[:n_tr, :], axis=0)
X = (X - trmean[np.newaxis, :]) / np.sqrt(trvar)[np.newaxis, :]
# take first n_tr as training, others as test.
Xtr = X[:n_tr, :]
ytr = y[:n_tr]
Xt = X[n_tr:, :]
yt = y[n_tr:]
# the parameters to consider in cross-validation
# order lasso params from largest to smallest so that if ties occur CV will select the one with more sparsity
params_lasso = [1, 1e-1, 1e-2, 1e-3, 1e-4]
params_rbf = [1e-3, 1e-2, 1e-1, 1, 10, 100]
params_svm = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000] # smaller C: more regularisation
# the noisy features to be considered; for n noisy features, add the first n columns
n_tot_noisy_feats = 50
np.random.seed(92)
X_noise = 2*np.random.randn(X.shape[0], n_tot_noisy_feats)
Xtr_noise = X_noise[:n_tr, :]
Xt_noise = X_noise[n_tr:, :]
svm_accs=[]
lasso_accs=[]
lasso_n_feats=[]
amount_of_noise_feature=np.arange(n_tot_noisy_feats+1)
k=5
for n_nosie_feats in amount_of_noise_feature:
print(n_nosie_feats)
Xtr_noisy = np.copy(Xtr)
Xt_nosiy = np.copy(Xt)
if n_nosie_feats > 0:
Xtr_noisy = np.hstack((Xtr_noisy, Xtr_noise[:,:n_nosie_feats]))
Xt_nosiy = np.hstack((Xt_nosiy, Xt_noise[:,:n_nosie_feats]))
lasso_valid_accs = np.zeros((k, len(params_lasso)))
svm_valid_accs = np.zeros((k, len(params_rbf), len(params_svm)))
from sklearn.model_selection import GridSearchCV
cscv_lasso=GridSearchCV(Lasso(),{"alpha":params_lasso},scoring='accuracy',cv=KFold(5))
cscv_lasso.fit(Xtr_noisy,ytr)
cscv_lassopreds=np.sign(cscv_lasso.best_estimator_.predict(Xt_nosiy))
cscv_svm = GridSearchCV(SVC(), {"C": params_svm,"gamma":params_rbf},scoring='accuracy', cv=KFold(5))
cscv_svm.fit(Xtr_noisy, ytr)
Which results in this error message:
UserWarning,
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_validation.py:700: UserWarning: Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_scorer.py", line 200, in __call__
sample_weight=sample_weight)
File "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_scorer.py", line 243, in _score
**self._kwargs)
File "/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
"and {1} targets".format(type_true, type_pred))
ValueError: Classification metrics can't handle a mix of binary and continuous targets
What is the lowest amount of noisy features added to the original data, when Lasso accuracy on test set is better than SVM?
It seems that the the predict is weird.
best lasso parameters: {'alpha': 0.1}
best svm params: {'C': 100, 'gamma': 0.001}
I want to find the best param for Lasso and SVC, and the corresponding accuracy_score when noisy features added to the original data.

Cross validation inconsistent numbers of samples error (Python)

I am trying to make a classification using cross validation method and SVM classifier. In my data file, the last column contains my classes (which are 0, 1, 2, 3, 4, 5) and the rest (except first column) is the numeric data that I want to use to predict these classes.
from sklearn import svm
from sklearn import metrics
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
filename = "Features.csv"
dataset = np.loadtxt(filename, delimiter=',', skiprows=1, usecols=range(1, 39))
x = dataset[:, 0:36]
y = dataset[:, 36]
print("len(x): " + str(len(x)))
print("len(y): " + str(len(x)))
skf = StratifiedKFold(n_splits=10, shuffle=False, random_state=42)
modelsvm = svm.SVC()
expected = y
print("len(expected): " + str(len(expected)))
predictedsvm = cross_val_score(modelsvm, x, y, cv=skf)
print("len(predictedsvm): " + str(len(predictedsvm)))
svm_results = metrics.classification_report(expected, predictedsvm)
print(svm_results)
And I am getting such an error:
len(x): 2069
len(y): 2069
len(expected): 2069
C:\Python\Python37\lib\site-packages\sklearn\model_selection\_split.py:297: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
len(predictedsvm): 10
Traceback (most recent call last):
File "C:/Users/MyComp/PycharmProjects/GG/AR.py", line 54, in <module>
svm_results = metrics.classification_report(expected, predictedsvm)
File "C:\Python\Python37\lib\site-packages\sklearn\utils\validation.py", line 73, in inner_f
return f(**kwargs)
File "C:\Python\Python37\lib\site-packages\sklearn\metrics\_classification.py", line 1929, in classification_report
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "C:\Python\Python37\lib\site-packages\sklearn\metrics\_classification.py", line 81, in _check_targets
check_consistent_length(y_true, y_pred)
File "C:\Python\Python37\lib\site-packages\sklearn\utils\validation.py", line 257, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [2069, 10]
Process finished with exit code 1
I don't understand how my data count in y goes down to 10 when I am trying to predict it using CV.
Can anyone help me on this please?
You are misunderstanding the output from cross_val_score. As per the documentation it returns "array of scores of the estimator for each run of the cross validation," not actual predictions. Because you have 10 folds, you get 10 values.
classification_report expects the true values and the predicted values. To use this, you'll want to predict with a model. To do this, you'll need to fit the model on the data. If you're happy with the results from cross_val_score you can train that model on the data. Or, you can use GridSearchCV to do this all in one sweep.

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

I am trying to predict using SVM but I receive the error
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
when executing line text_clf.fit(X_train,y_train) of my code. How to fix this and get the probability that my prediction is correct using SVM?
I am predicting the first column (gold) of my input file based on the values of the remaining columns. My input file dataExtended.txtis under the form:
gold,T-x-T,T-x-N,T-x-U,T-x-NT,T-x-UT,T-x-UN,T-x-UNT,N-x-T,N-x-N,N-x-U,N-x-NT,N-x-UT,N-x-UN,N-x-UNT,U-x-T,U-x-N,U-x-U,U-x-NT,U-x-UT,U-x-UN,U-x-UNT,NT-x-T,NT-x-N,NT-x-U,NT-x-NT,NT-x-UT,NT-x-UN,NT-x-UNT,UT-x-T,UT-x-N,UT-x-U,UT-x-NT,UT-x-UT,UT-x-UN,UT-x-UNT,UN-x-T,UN-x-N,UN-x-U,UN-x-NT,UN-x-UT,UN-x-UN,UN-x-UNT,UNT-x-T,UNT-x-N,UNT-x-U,UNT-x-NT,UNT-x-UT,UNT-x-UN,UNT-x-UNT,T-T-x,T-N-x,T-U-x,T-NT-x,T-UT-x,T-UN-x,T-UNT-x,N-T-x,N-N-x,N-U-x,N-NT-x,N-UT-x,N-UN-x,N-UNT-x,U-T-x,U-N-x,U-U-x,U-NT-x,U-UT-x,U-UN-x,U-UNT-x,NT-T-x,NT-N-x,NT-U-x,NT-NT-x,NT-UT-x,NT-UN-x,NT-UNT-x,UT-T-x,UT-N-x,UT-U-x,UT-NT-x,UT-UT-x,UT-UN-x,UT-UNT-x,UN-T-x,UN-N-x,UN-U-x,UN-NT-x,UN-UT-x,UN-UN-x,UN-UNT-x,UNT-T-x,UNT-N-x,UNT-U-x,UNT-NT-x,UNT-UT-x,UNT-UN-x,UNT-UNT-x,x-T-T,x-T-N,x-T-U,x-T-NT,x-T-UT,x-T-UN,x-T-UNT,x-N-T,x-N-N,x-N-U,x-N-NT,x-N-UT,x-N-UN,x-N-UNT,x-U-T,x-U-N,x-U-U,x-U-NT,x-U-UT,x-U-UN,x-U-UNT,x-NT-T,x-NT-N,x-NT-U,x-NT-NT,x-NT-UT,x-NT-UN,x-NT-UNT,x-UT-T,x-UT-N,x-UT-U,x-UT-NT,x-UT-UT,x-UT-UN,x-UT-UNT,x-UN-T,x-UN-N,x-UN-U,x-UN-NT,x-UN-UT,x-UN-UN,x-UN-UNT,x-UNT-T,x-UNT-N,x-UNT-U,x-UNT-NT,x-UNT-UT,x-UNT-UN,x-UNT-UNT,callersAtLeast1T,CalleesAtLeast1T,callersAllT,calleesAllT,CallersAtLeast1N,CalleesAtLeast1N,CallersAllN,CalleesAllN,childrenAtLeast1T,parentsAtLeast1T,childrenAtLeast1N,parentsAtLeast1N,childrenAllT,parentsAllT,childrenAllN,ParentsAllN,ParametersatLeast1T,FieldMethodsAtLeast1T,ReturnTypeAtLeast1T,ParametersAtLeast1N,FieldMethodsAtLeast1N,ReturnTypeN,ParametersAllT,FieldMethodsAllT,ParametersAllN,FieldMethodsAllN,ClassGoldN,ClassGoldT,Inner,Leaf,Root,Isolated,EmptyCallers,EmptyCallees,EmptyCallersCallers,EmptyCalleesCallees,Program,Requirement,MethodID
T,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,1,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,2,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,3,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,4,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,5,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,6,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,7,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,8,1
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,1,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,2,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,3,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,chess,4,3
N,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,chess,5,3
Here is my full reproducible code:
# Make Predictions with Naive Bayes On The Iris Dataset
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
data = pd.read_csv( 'dataExtended.txt', sep= ',')
row_count, column_count = data.shape
# Printing the dataswet shape
print ("Dataset Length: ", len(data))
print ("Dataset Shape: ", data.shape)
print("Number of columns ", column_count)
# Printing the dataset obseravtions
print ("Dataset: ",data.head())
data['gold'] = data['gold'].astype('category').cat.codes
data['Program'] = data['Program'].astype('category').cat.codes
# Building Phase Separating the target variable
X = data.values[:, 1:column_count]
Y = data.values[:, 0]
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size = 0.3, random_state = 100)
#Create a svm Classifier
svclassifier = svm.LinearSVC()
print('Before fitting')
svclassifier.fit(X_train, y_train)
predicted = svclassifier.predict(X_test)
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
text_clf.fit(X_train,y_train)
Traceback leading to error:
Traceback (most recent call last):
File "<ipython-input-9-8e85a0a9f81c>", line 1, in <module>
runfile('C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python/Paper4SVM.py', wdir='C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python')
File "C:\Users\mouna\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 668, in runfile
execfile(filename, namespace)
File "C:\Users\mouna\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/mouna/ownCloud/Mouna Hammoudi/dumps/Python/Paper4SVM.py", line 53, in <module>
text_clf.fit(X_train,y_train)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1381, in fit_transform
X = super(TfidfVectorizer, self).fit_transform(raw_documents)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 869, in fit_transform
self.fixed_vocabulary_)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 792, in _count_vocab
for feature in analyze(doc):
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 266, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "C:\Users\mouna\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 232, in <lambda>
return lambda x: strip_accents(x.lower())
You cannot use TF-IDF-related methods for numeric data; the method is exclusively for use with text data, hence it uses methods such as .tolower(), which are by default applicable to strings, hence the error. This is already apparent from the documentation:
fit(self, raw_documents, y=None)
Learn vocabulary and idf from training set.
Parameters
raw_documents: iterable
An iterable which yields either str, unicode or file objects.
I am afraid that your rationale, as explained in the comments:
I'm just trying to get the probability that each prediction is correct and TF-IDF seems to be the only way to do so when using SVM
is extremely weak. For starters, there is no such thing as "the probability that each prediction is correct" - I take it that you mean probabilistic predictions, in contrast to hard class predictions (see Predict classes or class probabilities?)
To get to the point of your actual requirement: in contrast to LinearSVC, which you are using here, SVC does indeed provide a predict_proba method, which should do the job (see the docs and the instructions therein). Notice that LinearSVC is not actually an SVM - see answer in Under what parameters are SVC and LinearSVC in scikit-learn equivalent? for details.
In short, forget about TF-IDF and switch to SVC instead of LinearSVC.

Error in handling shape and fit of training set while using Multinomial Naive Bayes for Text classification

I have done all the preprocessing task like removing stopwords,HTML tags etc.
I am trying to classify IMDB movie dataset(Large Movie Review Datasets of Stanford Unoversity) using Multinomial Naive Bayes. I am getting error on varibable X. I have made into 2D array but don't know how to handle error?
This is part of code of Multinomial Naive Bayes.
categories = ['pos','neg']
doc_to_train = sklearn.datasets.load_files("/home/satyam/aclImdb_v1/aclImdb/train", description = None, categories = categories ,load_content=True,enco ding='utf-8',shuffle=True,random_state=42)
vectorizer = CountVectorizer()
X = (vectorizer.fit_transform(tokens).toarray())
analyze = vectorizer.build_analyzer()
vect = vectorizer.get_feature_names()
y = np.array(doc_to_train.target)
X = X.reshape()
X = X.transpose()
print (X)
X_train, X_test, y_train,y_test= train_test_split(X,y, test_size=0.3)
mnb=MultinomialNB().fit(X_train,y_train).predict(X_test)
print ("MNB " %mnb)
print ("Prediction " %mnb.predict(X_test))
accuracy = mnb.score(X_test, y_test)
print ("Accuracy " %accuracy)
Error encountered are
Traceback (most recent call last):
File "sentiment_analysis_NB.py", line 92, in <module>
X = (vectorizer.fit_transform(tokens).toarray())
File "/usr/lib/python3.6/site-packages/scipy/sparse/compressed.py", line 943, in toarray
out = self._process_toarray_args(order, out)
File "/usr/lib/python3.6/site-packages/scipy/sparse/base.py", line 1130, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError

scikit-learn ValueError: dimension mismatch

This is my first time posting here. For the past couple of days I have been trying to teach myself scikit-learn. But recently I have encountered an error that has been nagging me for quite some time.
My goal is simply to train a NB classifier cli so that I can feed it an arbitrary list of strings called new_doc and it will predict what class the string is likely to belong to.
This is what my program looks like:
#Importing stuff
import numpy as np
import pylab
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn import metrics
#Opening the csv file
df = pd.read_csv('data.csv', sep=',')
#Randomising the rows in the file
df = df.reindex(np.random.permutation(df.index))
#Extracting features from text, define target y and data X
vect = CountVectorizer()
X = vect.fit_transform(df['Features'])
y = df['Target']
#Partitioning the data into test and training set
SPLIT_PERC = 0.75
split_size = int(len(y)*SPLIT_PERC)
X_train = X[:split_size]
X_test = X[split_size:]
y_train = y[:split_size]
y_test = y[split_size:]
#Training the model
clf = MultinomialNB()
clf.fit(X_train, y_train)
#Evaluating the results
print "Accuracy on training set:"
print clf.score(X_train, y_train)
print "Accuracy on testing set:"
print clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
print "Classification Report:"
print metrics.classification_report(y_test, y_pred)
#Predicting new data
new_doc = ["MacDonalds", "Walmart", "Target", "Starbucks"]
trans_doc = vect.transform(new_doc) #extracting features
y_pred = clf.predict(trans_doc) #predicting
But when I run the program I get the following error on the last row:
y_pred = clf.predict(trans_doc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Python/2.7/site-packages/sklearn/naive_bayes.py", line 62, in predict
jll = self._joint_log_likelihood(X)
File "/Library/Python/2.7/site-packages/sklearn/naive_bayes.py", line 441, in _joint_log_likelihood
return (safe_sparse_dot(X, self.feature_log_prob_.T)
File "/Library/Python/2.7/site-packages/sklearn/utils/extmath.py", line 175, in safe_sparse_dot
ret = a * b
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/sparse/base.py", line 334, in __mul__
raise ValueError('dimension mismatch')
ValueError: dimension mismatch
So apparently it has something to do with the dimension of the term-document matrixes.
When I check the dimensions of trans_doc, X_train and X_test i get:
>>> trans_doc.shape
(4, 4)
>>> X_train.shape
(145314, 28750)
>>> X_test.shape
(48439, 28750)
In order for y_pred = clf.predict(trans_doc) to work I need to (from what I understand it) transform new_doc into a term-document matrix with the dimensions (4, 28750). But I don't know of any methods within CountVectorizer that lets me do this.

Categories

Resources