I am trying to predict toxic comments using the Toxic Comment dataset from Kaggle:
import skmultilearn, sys
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
y = csr_matrix(data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']])
binary_rel_clf = BinaryRelevance(MultinomialNB())
binary_rel_clf.fit(Xfeatures,y)
predict_text = ['fuck die shit moron suck']
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
br_prediction = br_prediction.toarray().astype(bool)
predictions = [y.columns.values[prediction].tolist() for prediction in br_prediction]
print(predictions)
However, I got this error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\base\base.py", line 86, in _ensure_input_format
return X.toarray()
File "...\scipy\sparse\compressed.py", line 1031, in toarray
out = self._process_toarray_args(order, out)
File "...\scipy\sparse\base.py", line 1202, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 226. GiB for an array with shape (159571, 189775) and data type float64
And even if I try to pass the param require_dense=False, I get another error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\skmultilearn\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\sklearn\naive_bayes.py", line 612, in fit
X, y = self._check_X_y(X, y)
File "...\sklearn\naive_bayes.py", line 477, in _check_X_y
return self._validate_data(X, y, accept_sparse='csr')
File "...\sklearn\base.py", line 433, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 826, in check_X_y
y = column_or_1d(y, warn=True)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 864, in column_or_1d
raise ValueError(
ValueError: y should be a 1d array, got an array of shape () instead.
How can I fix that and train the model on the entire dataset?
It seems that you specified the require_dense argument incorrectly. You need require_dense=[False, True] in order to pass the X values in sparse format but keep the y values dense. In the second-to-last line (predictions = ...) you need to use the label DataFrame from before it was converted to a sparse matrix, so you can access the column names.
The following code should work.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
import numpy as np
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
cats = data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
y = csr_matrix(cats)
binary_rel_clf = BinaryRelevance(MultinomialNB(), require_dense = [False, True])
binary_rel_clf.fit(Xfeatures, y) # y[:,0].toarray().reshape(-1)
predict_text = ['fuck die shit moron suck']
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
br_prediction = br_prediction.toarray().astype(bool)
predictions = [cats.columns[prediction].tolist() for prediction in br_prediction]
print(predictions)
Output:
[['toxic', 'obscene', 'insult']]
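As an optional sanity check (purely illustrative, using the issparse import that is already there), you can confirm that the TF-IDF features stay sparse during training; keeping X sparse is what avoids the ~226 GiB dense allocation from the original error:
print(issparse(Xfeatures))  # True: still a scipy CSR matrix
print(Xfeatures.shape)      # (number of comments, number of TF-IDF terms)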
Related
I'm currently implementing machine learning using SMOTE from imblearn.over_sampling, and as I'm synthesizing data for it, I see a very noticeable cutoff for when the SMOTE method breaks. When I synthesize data using the following code and run it through SMOTE (courtesy of Jason Brownlee):
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_features=15, n_redundant=0,
                        n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
It works fine. However, when the number of features is 16...
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_features=16, n_redundant=0,
                        n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
SMOTE breaks. Why is this? Does anyone know of a SMOTE method that works for more than 15 features? By SMOTE breaking, I mean I get the error below:
Traceback (most recent call last):
File "\\arete\shared\Los Angeles\Users\Active\bbonifacio\New ADVANCE\untitled1.py", line 13, in <module>
X, y = oversample.fit_resample(X, y)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\imblearn\base.py", line 83, in fit_resample
output = self._fit_resample(X, y)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote\base.py", line 324, in _fit_resample
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
results = PairwiseDistancesArgKmin.compute(
File "sklearn\metrics\_pairwise_distances_reduction.pyx", line 691, in sklearn.metrics._pairwise_distances_reduction.PairwiseDistancesArgKmin.compute
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 151, in threadpool_limits
return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 171, in __init__
self._original_info = self._set_threadpool_limits()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 268, in _set_threadpool_limits
modules = _ThreadpoolInfo(prefixes=self._prefixes,
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 340, in __init__
self._load_modules()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 373, in _load_modules
self._find_modules_with_enum_process_module_ex()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 485, in _find_modules_with_enum_process_module_ex
self._make_module_from_path(filepath)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 606, in __init__
self.version = self.get_version()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
And here are the versions of packages:
Sklearn: 1.1.1
Imblearn: 0.9.1
Threadpoolctl: 2.1.0
Big thanks to rickhg12hs for this answer!
The solution is to update threadpoolctl. It was not working with my threadpoolctl version 2.1.0, but it works with the updated version. If anyone else is having this problem, type
pip install -U threadpoolctl
in your command terminal, and it should be fixed. Happy coding!
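If you want to double-check which threadpoolctl version your environment actually picks up (an optional sanity check), something like this works:
import threadpoolctl
print(threadpoolctl.__version__)  # 2.1.0 was the failing version here; after the upgrade this should be newer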
When I tried to use word vectors built from Chinese text as features for sklearn, an error occurred.
The shapes of x_train and word_vector are (747,) and (1,100), and the latter's dtype is float64.
I guessed the data types might be inconsistent, but when I went through all the data it looked fine ……
Here is the code:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
import SZ_function as sz
import gensim
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
def remove_stop_words(text):
stop_words = sz.get_step_words('notebook/HIT.txt')
text = text.split()
word_list = ''
for word in text:
if word not in stop_words:
word_list += word
word_list += ' '
return word_list
def pre_process(path):
data = pd.read_excel(path)
data['text'] = data['text'].apply(sz.remove_number_en)
data['text'] = data['text'].apply(sz.cut_words)
data['text'] = data['text'].apply(remove_stop_words)
data = data.replace(to_replace='', value='None')
data = data.replace(to_replace='None', value=np.nan).dropna()
return data
def create_corpus(data):
text = data['text']
return [sentences.split() for sentences in text]
def word_vec(corpus):
model = gensim.models.word2vec.Word2Vec(corpus)
return model
def get_sent_vec(sent,model,size):
vec = np.zeros(size).reshape((1,size))
count = 0
for word in sent[1:]:
try:
vec += model.wv[word].reshape((1,size))
count += 1
except:
continue
if count != 0:
vec /= count
return vec
if __name__ == '__main__':
data = pre_process('datasets_demo.xlsx')
corpus = create_corpus(data)
model = word_vec(corpus)
data['text']=data['text'].apply(get_sent_vec,model=model,size=100)
x_train,y_train,x_test,y_test = train_test_split(data['text'],data['label'])
estimator = MultinomialNB()
estimator.fit(x_train,y_train)
Here is the full traceback:
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\12996\AppData\Local\Temp\jieba.cache
Loading model cost 0.628 seconds.
Prefix dict has been built successfully.
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-8366eff678ac>", line 1, in <module>
runfile('C:/Users/12996/Desktop/Tensorflow_/datasets_demo.py', wdir='C:/Users/12996/Desktop/Tensorflow_')
File "E:\pycharm\PyCharm 2022.1\plugins\python\helpers\pydev\_pydev_bundle\pydev_umd.py", line 198, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "E:\pycharm\PyCharm 2022.1\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/12996/Desktop/Tensorflow_/datasets_demo.py", line 66, in <module>
estimator.fit(x_train,y_train)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\naive_bayes.py", line 663, in fit
X, y = self._check_X_y(X, y)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\naive_bayes.py", line 523, in _check_X_y
return self._validate_data(X, y, accept_sparse="csr", reset=reset)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\base.py", line 581, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\utils\validation.py", line 976, in check_X_y
estimator=estimator,
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\sklearn\utils\validation.py", line 746, in check_array
array = np.asarray(array, order=order, dtype=dtype)
File "E:\Anaconda3\envs\tensorflow-gpu\lib\site-packages\pandas\core\series.py", line 857, in __array__
return np.asarray(self._values, dtype)
ValueError: setting an array element with a sequence.
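One way past this error (a sketch, not an official answer; it only addresses the shape problem the traceback shows): each entry of data['text'] is a (1, 100) array after the apply call, so sklearn cannot turn the Series into a 2-D numeric matrix. Stacking the vectors into a plain array, unpacking train_test_split in the documented order, and using a classifier that tolerates negative feature values (averaged word2vec components are generally not the non-negative counts MultinomialNB expects) should get the fit to run:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Stack the per-sentence (1, 100) vectors into one (n_samples, 100) array.
X = np.vstack(data['text'].values)
y = data['label'].values

# train_test_split returns X_train, X_test, y_train, y_test, in that order.
x_train, x_test, y_train, y_test = train_test_split(X, y)

# GaussianNB accepts the negative values that averaged word2vec features contain.
estimator = GaussianNB()
estimator.fit(x_train, y_train)
print(estimator.score(x_test, y_test))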
import pandas as pd
import quandl
import math
import numpy as np
from sklearn import preprocessing, model_selection,svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
quandl.ApiConfig.api_key = "FVeuw21FAe86ux3J3ePr"
df=quandl.get("WIKI/GOOGL")
df=df[['Adj. Open','Adj. High','Adj. Low','Adj. Close','Adj. Volume']]
df['HL_PCT']=(df['Adj. High']-df['Adj. Close'])/df['Adj. Close']*100
df['DL_PCT']=(df['Adj. Close']-df['Adj. Open'])/df['Adj. Open']*100
df=df[['Adj. Close','HL_PCT','DL_PCT','Adj. Volume']]
forcast_col='Adj. Close'
df.fillna(-9999,inplace=True)
forcast_out=int(math.ceil(0.01*len(df)))
print(forcast_out)
df['label']=df[forcast_col].shift(-forcast_out)
df.dropna(inplace=True)
X=np.array(df.drop(['label'],1))
y=np.array(df['label'])
X=preprocessing.scale(X)
y=np.array(df['label'])
print (len(X),len(y))
X.shape[0]!=y.shape[0]
X_train,X_text,y_test,y_train=train_test_split(X,y,test_size=0.2)
plt.xlabel('area(m^2)')
plt.ylabel('price(Rs)')
plt.plot(X_train,y_train,color='blue',marker='.')
plt.show()
reg=LinearRegression()
reg.fit(X_train,y_train)
accuracy=reg.score(X_test,y_test)
print(accuracy)
The error I got while running this code:
Traceback (most recent call last):
File "c:/Users/user/Desktop/projetcs/machine learning/mc1.py", line 31, in <module>
plt.plot(X_train,y_train,color='blue',marker='.')
File "C:\Users\user\python11\lib\site-packages\matplotlib\pyplot.py", line 2824, in plot
return gca().plot(
File "C:\Users\user\python11\lib\site-packages\matplotlib\axes\_axes.py", line 1743, in plot
lines = [*self._get_lines(*args, data=data, **kwargs)]
File "C:\Users\user\python11\lib\site-packages\matplotlib\axes\_base.py", line 273, in __call__
yield from self._plot_args(this, kwargs)
File "C:\Users\user\python11\lib\site-packages\matplotlib\axes\_base.py", line 399, in _plot_args
raise ValueError(f"x and y must have same first dimension, but "
ValueError: x and y must have same first dimension,
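A minimal sketch of the fixes the traceback points at (assuming the rest of the script stays as above; the axis labels are only placeholders): train_test_split returns X_train, X_test, y_train, y_test in that order, so the mismatched unpacking (plus the X_text typo) leaves X_train and y_train with different lengths and X_test undefined for the later score call; and X has four columns, so only one feature can be plotted against the label:
# Unpack in the documented order: X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Plot a single feature column against the label instead of the whole 4-column X.
plt.xlabel('scaled Adj. Close')
plt.ylabel('label')
plt.plot(X_train[:, 0], y_train, color='blue', marker='.', linestyle='none')
plt.show()

reg = LinearRegression()
reg.fit(X_train, y_train)
accuracy = reg.score(X_test, y_test)
print(accuracy)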
I'm trying to improve a perfectly working Bernoulli Naive Bayes model with bagging.
But when I try to cross-validate the BaggingClassifier, I get a very unexpected ZeroDivisionError coming from parallel.py.
I've tried to change all the parameters I know, rebooted python but nothing worked.
Here is a reproducible example with a binary-modified iris dataset:
#%% run
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.datasets import load_iris
data = load_iris()
data.targetbin = (data.target!=0).astype("int")
param_grid2={'max_samples' : np.linspace(0.5,1.0,3),
'base_estimator__alpha':np.linspace(0.1,1,3),
'base_estimator__binarize':[*np.linspace(0.0,1,3)],
'base_estimator__fit_prior':[True,False]}
param_grid2={'max_samples' :[0.7]}
clf = GridSearchCV(
BaggingClassifier(
BernoulliNB(),
n_estimators = 10, max_features = 0.5),
param_grid2,
scoring = "accuracy",
verbose=-1)
clf.fit(data.data, data.targetbin)
And here is the stacktrace of my error:
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
File "", line 33, in <module>
clf.fit(data.data, data.targetbin)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 722, in fit
self._run_search(evaluate_candidates)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1191, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 711, in evaluate_candidates
cv.split(X, y, groups)))
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 184, in apply_async
callback(result)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 306, in __call__
self.parallel.print_progress()
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 806, in print_progress
if (is_last_item or cursor % frequency):
ZeroDivisionError: integer division or modulo by zero
What am I doing wrong?
I tried to debug the library and found that self.verbose in sklearn/externals/joblib/parallel.py is -1, even though it is supposed to be at least 0: the verbose=-1 passed to GridSearchCV is forwarded down to joblib's Parallel, whose progress reporting does not handle negative values. So I think it's a bug (or at least an unguarded edge case) triggered by the negative verbose setting.
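If the negative verbose value is indeed the trigger, the simplest workaround (a sketch reusing the objects from the question) is to keep verbose non-negative when building the GridSearchCV:
clf = GridSearchCV(
    BaggingClassifier(BernoulliNB(), n_estimators=10, max_features=0.5),
    param_grid2,
    scoring="accuracy",
    verbose=0)  # 0 disables progress output; any value >= 0 avoids the division by zero in joblib

clf.fit(data.data, data.targetbin)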
I am working on a Windows 7 8gb RAM.
This is the vectorizer I am using to vectorize a free text column in my 52MB training dataset
vec = CountVectorizer(analyzer='word',stop_words='english',decode_error='ignore',binary=True)
I want to calculate 5 nearest neighbours with this dataset for an 18MB test set.
nbrs = NearestNeighbors(n_neighbors=5).fit(vec.transform(data['clean_sum']))
vectors = vec.transform(data_test['clean_sum'])
distances,indices = nbrs.kneighbors(vectors)
This is the stack trace -
Traceback (most recent call last):
File "cr_nearness.py", line 224, in <module>
distances,indices = nbrs.kneighbors(vectors)
File "C:\Anaconda2\lib\site-packages\sklearn\neighbors\base.py", line 371, in kneighbors
n_jobs=n_jobs, squared=True)
File "C:\Anaconda2\lib\site-packages\sklearn\metrics\pairwise.py", line 12, in pairwise_distances
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
File "C:\Anaconda2\lib\site-packages\sklearn\metrics\pairwise.py", line 10, in _parallel_pairwise
return func(X, Y, **kwds)
File "C:\Anaconda2\lib\site-packages\sklearn\metrics\pairwise.py", line 23, in euclidean_distances
distances = safe_sparse_dot(X, Y.T, dense_output=True)
File "C:\Anaconda2\lib\site-packages\sklearn\utils\extmath.py", line 181, in safe_sparse_dot
ret = ret.toarray()
File "C:\Anaconda2\lib\site-packages\scipy\sparse\compressed.py", line 940, in toarray
return self.tocoo(copy=False).toarray(order=order, out=out)
File "C:\Anaconda2\lib\site-packages\scipy\sparse\coo.py", line 250, in toarray
B = self._process_toarray_args(order, out)
File "C:\Anaconda2\lib\site-packages\scipy\sparse\base.py", line 817, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError
Any ideas?
Use KNN with a KD tree:
model = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree').fit(X_train, Y_train)
Otherwise the model ends up doing a brute-force search (algorithm='brute'), and brute force takes too much memory here.
I think for your model it should look like this:
nbrs = NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(vec.transform(data['clean_sum']))
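One caveat worth checking (not part of the answer above): scikit-learn cannot build a KD tree from a sparse matrix and will quietly fall back to brute force, and KD trees also work best in fairly low dimensions. A possible workaround, sketched here under those assumptions, is to reduce the count vectors with TruncatedSVD before fitting, so the neighbors are then found in the reduced dense space:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# Reduce the sparse count vectors to a dense, low-dimensional representation
# that a KD tree can actually use; 100 components is an arbitrary starting point.
svd = TruncatedSVD(n_components=100)
X_train_red = svd.fit_transform(vec.transform(data['clean_sum']))
X_test_red = svd.transform(vec.transform(data_test['clean_sum']))

nbrs = NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(X_train_red)
distances, indices = nbrs.kneighbors(X_test_red)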