Error during data scaling in pandas DataFrame - Python

I have a dataset in CSV format. I am trying to scale my dataset, but I am getting an error. As I understand it, I need to convert from 3D to 2D, but I am not sure how to do that.
Example of my dataset:
63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1
57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0
56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0
56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2
44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0
52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0
57.0,1.0,3.0,150.0,168.0,0.0,0.0,174.0,0.0,1.6,1.0,0.0,3.0,0
48.0,1.0,2.0,110.0,229.0,0.0,0.0,168.0,0.0,1.0,3.0,0.0,7.0,1
54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0
My code:
import pandas as pd
from sklearn.preprocessing import StandardScaler
df = pd.read_csv('processed_cleveland_data.csv')
ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
Error:
ValueError Traceback (most recent call last)
<ipython-input-5-6db223ceefcd> in <module>
4 df = pd.read_csv('processed_cleveland_data.csv')
5 ss = StandardScaler()
----> 6 df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
~\Miniconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
697 if y is None:
698 # fit method of arity 1 (unsupervised transformation)
--> 699 return self.fit(X, **fit_params).transform(X)
700 else:
701 # fit method of arity 2 (supervised transformation)
~\Miniconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~\Miniconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~\Miniconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
614 array = array.astype(dtype, casting="unsafe", copy=False)
615 else:
--> 616 array = np.asarray(array, order=order, dtype=dtype)
617 except ComplexWarning as complex_warning:
618 raise ValueError("Complex data not supported\n"
~\Miniconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
~\Miniconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
1897
1898 def __array__(self, dtype=None) -> np.ndarray:
-> 1899 return np.asarray(self._values, dtype=dtype)
1900
1901 def __array_wrap__(
~\Miniconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: '?'

The CSV uses '?' as a placeholder for missing values, so pandas reads the affected columns as strings and StandardScaler cannot convert them to float. Use na_values to convert ? to missing values:
df = pd.read_csv('processed_cleveland_data.csv', na_values='?')
#if csv has no header
#df = pd.read_csv('processed_cleveland_data.csv', na_values='?', header=None)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
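StandardScaler itself tolerates NaN (it skips missing values when computing the mean and standard deviation), but many downstream estimators do not. A minimal sketch of filling the remaining gaps first, assuming median imputation is acceptable for this data:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('processed_cleveland_data.csv', na_values='?')

# median imputation is an assumption here; pick a strategy that fits the data
imputer = SimpleImputer(strategy='median')
df_filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df_filled), columns=df_filled.columns)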

Related

ValueError: Input contains NaN, ... when doing fit_transform() in BERTopic

I want to build a BERTopic model with my own clustering algorithm (KMeans) and my own vectorizer (CountVectorizer), but I keep getting this warning and error when I call .fit_transform(data):
Warning:
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/vectorizers/_ctfidf.py:69: RuntimeWarning:
divide by zero encountered in divide
And then the error:
ValueError Traceback (most recent call last)
<ipython-input-104-1f024d22018f> in <module>
----> 1 topics, probs = bert_topic_model.fit_transform(final_df.body)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/_bertopic.py in fit_transform(self, documents, embeddings, y)
368 self._map_representative_docs(original_topics=True)
369 else:
--> 370 self._save_representative_docs(documents)
371
372 self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/bertopic/_bertopic.py in _save_representative_docs(self, documents)
3000 bow = self.vectorizer_model.transform(selected_docs)
3001 ctfidf = self.ctfidf_model.transform(bow)
-> 3002 sim_matrix = cosine_similarity(ctfidf, self.c_tf_idf_[topic + self._outliers])
3003
3004 # Extract top 3 most representative documents
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/metrics/pairwise.py in cosine_similarity(X, Y, dense_output)
1178 # to avoid recursive import
1179
-> 1180 X, Y = check_pairwise_arrays(X, Y)
1181
1182 X_normalized = normalize(X, copy=True)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/metrics/pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype, accept_sparse, force_all_finite, copy)
144 estimator=estimator)
145 else:
--> 146 X = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
147 copy=copy, force_all_finite=force_all_finite,
148 estimator=estimator)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
648 if sp.issparse(array):
649 _ensure_no_complex_data(array)
--> 650 array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
651 dtype=dtype, copy=copy,
652 force_all_finite=force_all_finite,
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse)
446 % spmatrix.format, stacklevel=2)
447 else:
--> 448 _assert_all_finite(spmatrix.data,
449 allow_nan=force_all_finite == 'allow-nan')
450
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
This is my full code:
features = final_df["body"] # does not have NaN or Infinite values, I have checked 10 times
transformerVectoriser = CountVectorizer(analyzer = 'word', ngram_range = (1, 4), vocabulary = vocab_list)
#my vocab list does not have NaN or Infinite values, I have checked 10 times
cluster_model = KMeans(n_clusters = 50, init='k-means++', max_iter = 1500, random_state=None)
bert_topic_model = BERTopic(hdbscan_model = cluster_model,
vectorizer_model = transformerVectoriser,
verbose = True,
top_n_words = 15)
#final_df.body does not have NaN or Infinite values, I have checked 10 times
topics, probs = bert_topic_model.fit_transform(final_df.body) #ERROR
I really do not know what the problem is or what is going on.
All values in vocab_list are strings, and all values in final_df.body are strings.
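One plausible culprit, given the divide-by-zero warning in _ctfidf.py: with a fixed vocabulary, any document that contains none of the vocabulary terms produces an all-zero row in the bag-of-words matrix, which can yield NaN in the c-TF-IDF matrix and then fail the finiteness check inside cosine_similarity. A minimal diagnostic sketch (not a BERTopic API call), assuming vocab_list and final_df are as defined above:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# transform() works without fit() because the vocabulary is fixed
cv = CountVectorizer(analyzer='word', ngram_range=(1, 4), vocabulary=vocab_list)
bow = cv.transform(final_df.body)

# rows that sum to zero match no vocabulary term at all
row_sums = np.asarray(bow.sum(axis=1)).ravel()
empty_docs = np.where(row_sums == 0)[0]
print(f"{len(empty_docs)} documents match no term in vocab_list")
If such documents exist, dropping them (or widening vocab_list) before fit_transform is worth trying.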

scipy sparse matrix negative column index found

I am trying to run the PolynomialFeatures from sklearn with a large sparse matrix as input:
[1] x
>>> <11967295x120006 sparse matrix of type '<class 'numpy.int64'>'
with 55375058 stored elements in Compressed Sparse Row format>
[2] from sklearn.preprocessing import PolynomialFeatures
[3] pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
[4] xinter = pf.fit_transform(x)
And got the error ValueError: negative column index found:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-78-dc5dc18d59d2> in <module>
1 start = time.time()
----> 2 xinter = pf.fit_transform(x)
3 end = time.time()
/venv/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
/venv/lib/python3.6/site-packages/sklearn/preprocessing/data.py in transform(self, X)
1523 break
1524 to_stack.append(Xp_next)
-> 1525 XP = sparse.hstack(to_stack, format='csr')
1526 elif sparse.isspmatrix_csc(X) and self.degree < 4:
1527 return self.transform(X.tocsr()).tocsc()
/venv/lib/python3.6/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/venv/lib/python3.6/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
572 for j in range(N):
573 if blocks[i,j] is not None:
--> 574 A = coo_matrix(blocks[i,j])
575 blocks[i,j] = A
576 block_mask[i,j] = True
/venv/lib/python3.6/site-packages/scipy/sparse/coo.py in __init__(self, arg1, shape, dtype, copy)
170 self._shape = check_shape(arg1.shape)
171 else:
--> 172 coo = arg1.tocoo()
173 self.row = coo.row
174 self.col = coo.col
/venv/lib/python3.6/site-packages/scipy/sparse/compressed.py in tocoo(self, copy)
1015 from .coo import coo_matrix
1016 return coo_matrix((self.data, (row, col)), self.shape, copy=copy,
-> 1017 dtype=self.dtype)
1018
1019 tocoo.__doc__ = spmatrix.tocoo.__doc__
/venv/lib/python3.6/site-packages/scipy/sparse/coo.py in __init__(self, arg1, shape, dtype, copy)
196 self.data = self.data.astype(dtype, copy=False)
197
--> 198 self._check()
199
200 def reshape(self, *args, **kwargs):
/venv/lib/python3.6/site-packages/scipy/sparse/coo.py in _check(self)
289 raise ValueError('negative row index found')
290 if self.col.min() < 0:
--> 291 raise ValueError('negative column index found')
292
293 def transpose(self, axes=None, copy=False):
ValueError: negative column index found
This does not look right because the col index type is int64:
> /venv/lib/python3.6/site-packages/scipy/sparse/coo.py(291)_check()
289 raise ValueError('negative row index found')
290 if self.col.min() < 0:
--> 291 raise ValueError('negative column index found')
292
293 def transpose(self, axes=None, copy=False):
ipdb> self.col.max()
2147482788
ipdb> self.col.dtype
dtype('int64')
ipdb> self.col.min()
-2147480639
I am using the following versions:
scipy.__version__
'1.3.1'
sklearn.__version__
'0.21.2'
I appreciate any help to figure out this issue!
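The clue is in the ipdb output: col.max() sits just below 2**31 - 1 and col.min() is a large negative number even though the dtype is int64 — the classic signature of a 32-bit integer overflow somewhere upstream in the sparse hstack. With interaction_only=True and degree=2, the output has n + n*(n-1)/2 columns, which for n = 120006 is far beyond what a 32-bit index can address. A quick sketch of the arithmetic:
import numpy as np

n = 120006
# PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
# emits the n original columns plus one column per feature pair
n_out = n + n * (n - 1) // 2
print(n_out)                            # 7200780021, roughly 7.2e9 columns
print(np.iinfo(np.int32).max)           # 2147483647
print(n_out > np.iinfo(np.int32).max)   # True: int32 column indices overflow
If that is the cause, the likely remedies are fewer input features per transform, or a scipy/sklearn combination that keeps int64 indices through sparse.hstack, rather than anything wrong with the input matrix itself.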

min_max_scaler.fit_transform: ValueError: setting an array element with a sequence

I'm using min_max_scaler.fit_transform() to rescale each column in the Dataframe - df.
df[['A', 'B', 'C']] = min_max_scaler.fit_transform(df[['A', 'B', 'C']])
I got the ValueError: setting an array element with a sequence. However, this error only occurs when I process one of my CSV files; all the others work fine. I don't know where I should start debugging. Can anyone suggest some directions to figure out the issue?
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
569 if y is None:
570 # fit method of arity 1 (unsupervised transformation)
--> 571 return self.fit(X, **fit_params).transform(X)
572 else:
573 # fit method of arity 2 (supervised transformation)
~/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_data.py in fit(self, X, y)
337 # Reset internal state before fitting
338 self._reset()
--> 339 return self.partial_fit(X, y)
340
341 def partial_fit(self, X, y=None):
~/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_data.py in partial_fit(self, X, y)
371 X = check_array(X,
372 estimator=self, dtype=FLOAT_DTYPES,
--> 373 force_all_finite="allow-nan")
374
375 data_min = np.nanmin(X, axis=0)
~/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
529 array = array.astype(dtype, casting="unsafe", copy=False)
530 else:
--> 531 array = np.asarray(array, order=order, dtype=dtype)
532 except ComplexWarning:
533 raise ValueError("Complex data not supported\n"
~/anaconda3/lib/python3.7/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.
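This error usually means at least one cell in the selected columns is not a plain scalar — for example a list or array left over from earlier processing — so NumPy cannot build a rectangular float array from them. A minimal diagnostic sketch for narrowing it down, assuming df is the DataFrame loaded from the problem CSV:
import numpy as np

sub = df[['A', 'B', 'C']]
print(sub.dtypes)  # object dtype is a hint that a column holds mixed content

for col in sub.columns:
    # flag cells whose value is a sequence rather than a scalar
    bad = sub[col].map(lambda v: isinstance(v, (list, tuple, set, dict, np.ndarray)))
    if bad.any():
        print(col, 'offending rows:', sub.index[bad].tolist()[:5])
Comparing the dtypes of the failing file against one that works is often the fastest way to spot the difference.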

Jupyter: Creating Decision Tree, TypeError: '<' not supported between instances of 'str' and 'float'

I'm creating a decision tree in a Jupyter notebook, and when I fit the features and target class, Jupyter raises an error in the cell containing dt = c.fit(X_train, y_train).
This is the error.
TypeError Traceback (most recent call last)
<ipython-input-10-0f9186b7935c> in <module>()
----> 1 dt = c.fit(X_train, y_train)
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
788 sample_weight=sample_weight,
789 check_input=check_input,
--> 790 X_idx_sorted=X_idx_sorted)
791 return self
792
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
138
139 if is_classification:
--> 140 check_classification_targets(y)
141 y = np.copy(y)
142
~\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
167 y : array-like
168 """
--> 169 y_type = type_of_target(y)
170 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
171 'multilabel-indicator', 'multilabel-sequences']:
~\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
286 return 'continuous' + suffix
287
--> 288 if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
289 return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
290 else:
~\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
221 ar = np.asanyarray(ar)
222 if axis is None:
--> 223 return _unique1d(ar, return_index, return_inverse, return_counts)
224 if not (-ar.ndim <= axis < ar.ndim):
225 raise ValueError('Invalid axis kwarg specified for unique')
~\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
281 aux = ar[perm]
282 else:
--> 283 ar.sort()
284 aux = ar
285 flag = np.concatenate(([True], aux[1:] != aux[:-1]))
TypeError: '<' not supported between instances of 'str' and 'float'
I'm confused because my dataset features are clean (they are all int) and only the target class is categorical.
Can someone tell me what is happening and what to do so that dt = c.fit(X_train, y_train) will work? Here c = DecisionTreeClassifier(min_samples_split=100), a decision tree classifier.
This is my sample data set:
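The traceback ends inside np.unique(y), which sorts the target; that sort fails when y_train mixes strings with floats, and the most common stray float in an otherwise categorical column is NaN. A minimal diagnostic sketch, assuming y_train is a pandas Series:
print(y_train.map(type).value_counts())  # more than one type means a mixed target
print('NaN values:', y_train.isna().sum())

# assumption: dropping the NaN rows restores a single-type target
mask = y_train.notna()
dt = c.fit(X_train[mask], y_train[mask])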

Python shapes not aligned

I have searched Google for help on this error and tried using dummies, but so far nothing has helped. My code is:
df_movies_train_reg = df_movies[df_movies['Release Year'] < 2014]
df_movies_test_reg = df_movies[df_movies['Release Year'] > 2013]
df_train = pd.DataFrame()
df_test = pd.DataFrame()
df_train['x'] = df_movies_train_reg['Theaters in First Week'].copy()
df_train['y'] = df_movies_train_reg['Worldwide Gross'].copy()
df_test['x'] = df_movies_test_reg['Theaters in First Week'].copy()
df_test['y'] = df_movies_test_reg['Worldwide Gross'].copy()
df_train['x^2'] = df_train['x'] * df_train['x']
X_reg = df_train[['x', 'x^2']]
y_reg = df_train[['y']]
reg = LinearRegression()
model = reg.fit(X_reg, y_reg)
score = model.score(df_test[['x']], df_test[['y']])
The dataset has 2328 rows; after filtering by year, the training set has 2037 rows and the test set has 291 rows.
ValueError Traceback (most recent call last)
<ipython-input-116-aee4ec5bd647> in <module>
50 model = reg.fit(X_reg, y_reg)
51
---> 52 score = model.score(df_test[['x']], df_test[['y']])
53
54
~\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
326
327 from .metrics import r2_score
--> 328 return r2_score(y, self.predict(X), sample_weight=sample_weight,
329 multioutput='variance_weighted')
330
~\Anaconda3\lib\site-packages\sklearn\linear_model\base.py in predict(self, X)
211 Returns predicted values.
212 """
--> 213 return self._decision_function(X)
214
215 _preprocess_data = staticmethod(_preprocess_data)
~\Anaconda3\lib\site-packages\sklearn\linear_model\base.py in _decision_function(self, X)
196 X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
197 return safe_sparse_dot(X, self.coef_.T,
--> 198 dense_output=True) + self.intercept_
199
200 def predict(self, X):
~\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
171 return ret
172 else:
--> 173 return np.dot(a, b)
174
175
ValueError: shapes (291,1) and (2,1) not aligned: 1 (dim 1) != 2 (dim 0)
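The shapes in the error tell the story: the model was fit on two features (x and x^2), so coef_ has shape (2, 1), but score is called with only one column of shape (291, 1). Building the same squared feature for the test set should resolve it:
df_test['x^2'] = df_test['x'] * df_test['x']
score = model.score(df_test[['x', 'x^2']], df_test[['y']])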
