Predict movie reviews with scikit-learn - python

I'm using scikit-learn's MultinomialNB and CountVectorizer to build a model that predicts whether a review is good or bad.
After training on the labelled data, how do I use the model to predict new reviews (or existing ones)? I'm getting the error message below.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.quote)
X = X.tocsc()
Y = (df.fresh == 'fresh').values.astype(np.int)
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
clf = MultinomialNB().fit(xtrain, ytrain)
new_review = ['this is a new review, movie was awesome']
new_review = vectorizer.fit_transform(new_review)
print df.quote[15]
print(clf.predict(df.quote[10])) #predict existing review in dataframe
print(clf.predict(new_review)) #predict new review
Technically, Toy Story is nearly flawless.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-91-27a0698bbd1f> in <module>()
15
16 print df.quote[15]
---> 17 print(clf.predict(df.quote[10])) #predict existing quote in dataframe
18 print(clf.predict(new_review)) #predict new review
//anaconda/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in predict(self, X)
60 Predicted target values for X
61 """
---> 62 jll = self._joint_log_likelihood(X)
63 return self.classes_[np.argmax(jll, axis=1)]
64
//anaconda/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _joint_log_likelihood(self, X)
439 """Calculate the posterior log probability of the samples X"""
440 X = atleast2d_or_csr(X)
--> 441 return (safe_sparse_dot(X, self.feature_log_prob_.T)
442 + self.class_log_prior_)
443
//anaconda/lib/python2.7/site-packages/sklearn/utils/extmath.pyc in safe_sparse_dot(a, b, dense_output)
178 return ret
179 else:
--> 180 return fast_dot(a, b)
181
182
TypeError: Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe'

You need to pass a bag-of-words representation to predict, not the text directly. You are doing it almost correctly with new_review; just change that line to new_review = vectorizer.transform(new_review) (see Stergios's comment). Try this:
print(clf.predict(X[10, :]))
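For completeness, a minimal sketch of the corrected end-to-end flow (assuming the same df with quote and fresh columns as in the question). The key point is that the vectorizer is fitted once on the training text, and only transform is used afterwards:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Fit the vectorizer ONCE, on the training text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.quote)
Y = (df.fresh == 'fresh').values.astype(int)
clf = MultinomialNB().fit(X, Y)

# Existing review: index into the already-vectorized matrix
print(clf.predict(X[10, :]))

# New review: transform (NOT fit_transform) with the fitted vectorizer,
# so the new text is mapped onto the training vocabulary
new_review = vectorizer.transform(['this is a new review, movie was awesome'])
print(clf.predict(new_review))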

Related

Using forestci to create error bars for random forest regression algorithms

I am using a program called GALPRO to implement a random forest regression algorithm to predict photometric redshift estimates. It uses a random forest as its machine-learning method. I input training and testing data: x_train (dimensions = [90,13]), x_test (dimensions = [10,13]), y_train (dimensions = [90,2]) and y_test (dimensions = [10,2]).
The code below shows how GALPRO does the random forest regression calculation:
model = RandomForestRegressor(**self.params)
model.fit(x_train, y_train)
I then make point estimate predictions using:
# Use the model to make predictions on new objects
y_pred = model.predict(x_test)
I am then trying to create error estimates using the forestci package random_forest_error:
y_error = fci.random_forest_error(model, x_train, x_test)
However I get an error:
ValueError Traceback (most recent call last)
/tmp/ipykernel_2626600/1096083143.py in <module>
----> 1 point_estimates = model.point_estimate(save_estimates=True, make_plots=False)
2 print(point_estimates)
/scratch/wiay/lara/galpro/galpro/model.py in point_estimate(self, save_estimates, make_plots)
158 # Use the model to make predictions on new objects
159 y_pred = self.model.predict(self.x_test)
--> 160 y_error = fci.random_forest_error(self.model, self.x_train, self.x_test)
161
162 # Update class variables
~/.local/lib/python3.7/site-packages/forestci/forestci.py in random_forest_error(forest, X_train, X_test, inbag, calibrate, memory_constrained, memory_limit)
279 n_trees = forest.n_estimators
280 V_IJ = _core_computation(
--> 281 X_train, X_test, inbag, pred_centered, n_trees, memory_constrained, memory_limit
282 )
283 V_IJ_unbiased = _bias_correction(V_IJ, inbag, pred_centered, n_trees)
~/.local/lib/python3.7/site-packages/forestci/forestci.py in _core_computation(X_train, X_test, inbag, pred_centered, n_trees, memory_constrained, memory_limit, test_mode)
135 """
136 if not memory_constrained:
--> 137 return np.sum((np.dot(inbag - 1, pred_centered.T) / n_trees) ** 2, 0)
138
139 if not memory_limit:
<__array_function__ internals> in dot(*args, **kwargs)
ValueError: shapes (90,100) and (100,10,2) not aligned: 100 (dim 1) != 10 (dim 1)
I'm not sure what this error means or why my dimensions are wrong as I am following a similar example. If anyone has any ideas please let me know!
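No answer was posted, but the shapes in the traceback suggest a reading: inbag has shape (n_train, n_trees) = (90, 100), while the per-tree predictions carry an extra dimension of size 2 from the two-column y, so the 2-D dot product inside forestci fails. forestci appears to assume a single-output forest. A sketch of a workaround, under that assumption, is to fit one single-output forest per target column (x_train, x_test and y_train as in the question):

import numpy as np
import forestci as fci
from sklearn.ensemble import RandomForestRegressor

# One single-output forest per target column, so that
# fci.random_forest_error sees 1-D predictions per model
errors = []
for j in range(y_train.shape[1]):  # y_train assumed to have shape (90, 2)
    model_j = RandomForestRegressor(n_estimators=100)
    model_j.fit(x_train, y_train[:, j])
    errors.append(fci.random_forest_error(model_j, x_train, x_test))
y_error = np.column_stack(errors)  # shape (10, 2): one variance per output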

RandomSurvivalForest: y must be a structured array with the first field being a binary class

I am not sure why I am getting the following error.
y must be a structured array with the first field being a binary class event indicator and the second field the time of the event/censoring
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<command-2136632501118727> in <module>
5 n_jobs=-1,
6 random_state=0)
----> 7 rsf.fit(X_train, y_train)
/databricks/python/lib/python3.8/site-packages/sksurv/ensemble/forest.py in fit(self, X, y, sample_weight)
235 self
236 """
--> 237 X, event, time = check_arrays_survival(X, y)
238
239 self.n_features_ = X.shape[1]
/databricks/python/lib/python3.8/site-packages/sksurv/util.py in check_arrays_survival(X, y, **kwargs)
192 Time of event or censoring.
193 """
--> 194 event, time = check_y_survival(y)
195 kwargs.setdefault("dtype", numpy.float64)
196 X = check_array(X, ensure_min_samples=2, **kwargs)
/databricks/python/lib/python3.8/site-packages/sksurv/util.py in check_y_survival(y_or_event, allow_all_censored, *args)
132
133 if not isinstance(y, numpy.ndarray) or y.dtype.fields is None or len(y.dtype.fields) != 2:
--> 134 raise ValueError('y must be a structured array with the first field'
135 ' being a binary class event indicator and the second field'
136 ' the time of the event/censoring')
ValueError: y must be a structured array with the first field being a binary class event indicator and the second field the time of the event/censoring
I have tried converting the datatype to bool, as well as converting to an array.
my data looks like this:
below day_of_quarter
0 0 87
1 1 38
2 0 18
3 1 84
4 0 64
and here is my code, using the scikit-survival package. The data should be set up for a survival analysis.
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import train_test_split

df = data.select(col('below'), col('day_of_quarter')).toPandas()
x = df.day_of_quarter
y = df.below.astype(bool)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0)
rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=0)
rsf.fit(X_train, y_train)
Assuming these are your censor and temporal data for RSF:
import pandas as pd
below = [0,1,0,1,0]
day_of_quarter = [87, 38, 18, 84, 64]
your_data = pd.DataFrame({'below': below, 'day_of_quarter': day_of_quarter})
your_data['below'] = your_data['below'].astype(bool)
your_data
You need to create a structured array that has two fields. You can use the to_records() function:
y = your_data.to_records(index=False)
y
This is how your y must look for RandomSurvivalForest.
You haven't provided information on the x data, but it should be your input features: a matrix with one row per patient and one column per feature (e.g. age, sex).
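Putting the pieces together, a minimal sketch (the feature matrix here is a placeholder, since the question only shows the event and time columns; note that X must be 2-D):

import numpy as np
from sksurv.ensemble import RandomSurvivalForest

# Structured y from the answer above: bool event indicator first, then time
y = your_data.to_records(index=False)

# Hypothetical 2-D feature matrix -- substitute your real covariates.
# One row per sample, one column per feature (e.g. age, sex).
X = np.array([[60.0, 0.0],
              [55.0, 1.0],
              [70.0, 0.0],
              [48.0, 1.0],
              [65.0, 0.0]])

rsf = RandomSurvivalForest(n_estimators=10, random_state=0)
rsf.fit(X, y)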

Using a trained sentiment analysis model, TF-IDF and logistic regression

I'm doing a sentiment analysis project on a Twitter dataset. I used TF-IDF feature extraction and a logistic regression model for classification. So far I've trained the model with the following:
def get_tfidf_features(train_fit, ngrams=(1, 1)):
    vector = TfidfVectorizer(ngram_range=ngrams, sublinear_tf=True)
    vector.fit(train_fit)
    return vector

tf_vector = get_tfidf_features(df['text'])
X = tf_vector.transform(df['text'])
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.01, random_state = 42)
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
This logistic regression model was trained on a dataset of about 1.5 million tweets. I now have a new dataset of about 18,000 tweets, df_chi, with shape (18393, 7), and I want to use the trained model to predict sentiment scores for it: apply the model to the text column and store the predictions in a new sentiment column. (The text still needs cleaning, which I'll do.)
I'm an ML noob and I've never taken a trained model and applied it to new data. My confusion starts with extracting features from the df_chi text with TF-IDF. I attempted this (a total guess):
tf_vector = get_tfidf_features(df_chi['text'])
X = tf_vector.transform(df_chi['text'])
df_chi['sentiment'] = LR_model.predict(X)
which gives the following ValueError:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-188-0cf1a4f34c8b> in <module>
1 tf_vector = get_tfidf_features(df_chi['text'])
2 X = tf_vector.transform(df_chi['text'])
----> 3 df_chi['sentiment'] = LR_model.predict(X)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_base.py in predict(self, X)
291 Predicted class label per sample.
292 """
--> 293 scores = self.decision_function(X)
294 if len(scores.shape) == 1:
295 indices = (scores > 0).astype(np.int)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_base.py in decision_function(self, X)
271 if X.shape[1] != n_features:
272 raise ValueError("X has %d features per sample; expecting %d"
--> 273 % (X.shape[1], n_features))
274
275 scores = safe_sparse_dot(X, self.coef_.T,
ValueError: X has 22806 features per sample; expecting 265054
Pretty sure my whole approach to applying the trained model on the new data is incorrect. What's the right way to do this?
Noodled around with this and came up with the following solution:
tfidf = TfidfVectorizer()
X_chi = tfidf.fit_transform(df_chi['text'])
X1 = pd.DataFrame.sparse.from_spmatrix(X)
X_chi1 = pd.DataFrame.sparse.from_spmatrix(X_chi)
not_existing_cols = [c for c in X1.columns.tolist() if c not in X_chi1]
X_chi1 = X_chi1.reindex(X_chi1.columns.tolist() + not_existing_cols, axis=1)
#X_chi.fillna(0, inplace=True)
X_chi1 = X_chi1[X1.columns.tolist()]
a = LR_model.predict(X_chi1)
df_chi['sentiment'] = a
Solution inspired by Logistic regression: X has 667 features per sample; expecting 74869
Looks a little clumsy, though. If it works it works, I guess. Though I suspect there might be a better way to do this, no?
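There is indeed a simpler route, provided the vectorizer fitted on the training tweets (tf_vector above) is still available: never refit TF-IDF on the new data; reuse the fitted vectorizer so the new text is mapped onto the same 265054-feature vocabulary the model expects. A minimal sketch:

# Reuse the vectorizer fitted on the 1.5M training tweets; transform
# (not fit_transform) keeps the training vocabulary and feature count
X_chi = tf_vector.transform(df_chi['text'])
df_chi['sentiment'] = LR_model.predict(X_chi)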

using bagging algorithm with multiple models

I am trying to build a model for the LasVegasTripAdvisorReviews dataset using a bagging algorithm, and I get an error: "Multilabel and multi-output classification is not supported". Can you please help me and tell me how to solve it?
Regards
The attachment contains a link to the dataset: LasVegasTripAdvisorReviews-Dataset
# Voting Ensemble for Classification
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier,GradientBoostingClassifier,AdaBoostClassifier,RandomForestClassifier
url = "h:/LasVegasTripAdvisorReviews-Dataset.csv"
names = ['User country','Nr. reviews','Nr. hotel reviews','Helpful votes','Period of stay','Traveler type','Pool','Gym','Tennis court','Spa','Casino','Free internet','Hotel name','Hotel stars','Nr. rooms','User continent','Member years','Review month','Review weekday','Score']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:,:]
Y = array[:,:]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
model1 = AdaBoostClassifier()
estimators.append(('AdaBoost', model1))
model2 = GradientBoostingClassifier()
estimators.append(('GradientBoosting', model2))
model3 = RandomForestClassifier()
estimators.append(('RandomForest', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())
Stacktrace:
NotImplementedError Traceback (most recent call last)
<ipython-input-9-bda887b4022f> in <module>
27 # create the ensemble model
28 ensemble = VotingClassifier(estimators)
---> 29 results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
30 print(results.mean())
/usr/local/lib/python3.5/dist-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
...
...
NotImplementedError: Multilabel and multi-output classification is not supported.
You have these lines:
X = array[:,:]
Y = array[:,:]
Meaning that your feature matrix (X) and target vector (Y) are identical.
You need to choose only one column to be your Y.
For example, suppose you want your last column to be Y.
Then you should change the lines above to:
X = array[:,:-1]
Y = array[:,-1]
This should solve the error you got (note the question's variable is named array, not values, and slicing with -1 rather than -1: keeps Y one-dimensional). The error basically means: more than one column in Y is not supported.
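If the intended target is the Score column (an assumption; the question doesn't say which column is the label), selecting it by name from the DataFrame is less error-prone than positional slicing:

X = dataframe.drop('Score', axis=1).values
Y = dataframe['Score'].values  # 1-D target vector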

Positional argument error when trying to train SGDClassifier on binary classification

I'm working through Aurelien Geron's Hands-On ML textbook and have got stuck trying to train an SGDClassifier.
I'm using the MNIST handwritten numbers data and running my code in a Jupyter Notebook via Anaconda. Both my anaconda (1.7.0) and sklearn (0.20.dev0) are updated. I've pasted the code I used to load the data, select the first 60k rows, shuffle the order and convert the labels to 1 (True) for all 5's and 0 (False) for all other numbers. Both X_train and y_train_5 are numpy arrays.
I've pasted the error message I get below.
Nothing seems to be wrong with the dimensions of the data, I tried converting X_train to a sparse matrix (the suggested format for SGDClassifier) and various max_iter values and got the same error message each time. Am I missing something obvious? Do I need to use a different version of sklearn? I've searched online but couldn't find any posts describing similar issues with SGDClassifier. I'd be super grateful for any kind of pointer.
Code
from six.moves import urllib
from scipy.io import loadmat
import numpy as np
from sklearn.linear_model import SGDClassifier

# Load MNIST data #
mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
mnist_path = "./mnist-original.mat"
response = urllib.request.urlopen(mnist_alternative_url)
with open(mnist_path, "wb") as f:
    content = response.read()
    f.write(content)
mnist_raw = loadmat(mnist_path)
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
}

# Assign X and y #
X, y = mnist['data'], mnist['target']

# Select first 60000 numbers #
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Shuffle order #
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Convert labels to binary (5 or "not 5") #
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

# Train SGDClassifier #
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)
Error Message
---------------------------------------------------------------------------
TypeError
Traceback (most recent call last)
<ipython-input-10-5a25eed28833> in <module>()
37 # Train SGDClassifier
38 sgd_clf = SGDClassifier(max_iter=5, random_state=42)
---> 39 sgd_clf.fit(X_train, y_train_5)
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
712 loss=self.loss, learning_rate=self.learning_rate,
713 coef_init=coef_init, intercept_init=intercept_init,
--> 714 sample_weight=sample_weight)
715
716
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
570
571 self._partial_fit(X, y, alpha, C, loss, learning_rate, self._max_iter,
--> 572 classes, sample_weight, coef_init, intercept_init)
573
574 if (self._tol is not None and self._tol > -np.inf
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init)
529 learning_rate=learning_rate,
530 sample_weight=sample_weight,
--> 531 max_iter=max_iter)
532 else:
533 raise ValueError(
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _fit_binary(self, X, y, alpha, C, sample_weight, learning_rate, max_iter)
587 self._expanded_class_weight[1],
588 self._expanded_class_weight[0],
--> 589 sample_weight)
590
591 self.t_ += n_iter_ * X.shape[0]
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, pos_weight, neg_weight, sample_weight)
419 pos_weight, neg_weight,
420 learning_rate_type, est.eta0,
--> 421 est.power_t, est.t_, intercept_decay)
422
423 else:
~\Anaconda3\lib\site-packages\sklearn\linear_model\sgd_fast.pyx in sklearn.linear_model.sgd_fast.plain_sgd()
TypeError: plain_sgd() takes at most 21 positional arguments (25 given)
It appears your version of scikit-learn is just a little outdated. Try running:
pip install -U scikit-learn
then your code will run (with some slight formatting updates):
from six.moves import urllib
from scipy.io import loadmat
import numpy as np
from sklearn.linear_model import SGDClassifier

# Load MNIST data #
mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
mnist_path = "./mnist-original.mat"
response = urllib.request.urlopen(mnist_alternative_url)
with open(mnist_path, "wb") as f:
    content = response.read()
    f.write(content)
mnist_raw = loadmat(mnist_path)
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
}

# Assign X and y #
X, y = mnist['data'], mnist['target']

# Select first 60000 numbers #
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Shuffle order #
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Convert labels to binary (5 or "not 5") #
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

# Train SGDClassifier #
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)
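If it's unclear whether the upgrade took effect, the installed version can be checked before re-running (anything at or above a released 0.20, rather than 0.20.dev0, should work here):

import sklearn
print(sklearn.__version__)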
