Question on Python Feature Selection (null values)

I'm building a logistic regression model and want to understand which features contribute the most to my output (1 or 0). In other words, if a customer comes back to my website, which features are making them come back? I'm getting stuck on the fit function: it errors out and I can't figure out why. The traceback seems to indicate that I have some null values, but I've scrubbed through my data and removed them.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#load data
df = pd.read_csv('jupyter.csv', header = 0)
array = df.values
X = array[:,1:13]
Y = array[:,14]
print (X.shape)
print (Y.shape)
(544219, 12)
(544219,)
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-63-f91db4d08897> in <module>
1 # feature extraction
2 test = SelectKBest(score_func=chi2, k=4)
----> 3 fit = test.fit(X, Y)
4 # summarize scores
5 #numpy.set_printoptions(precision=3)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_selection/univariate_selection.py in fit(self, X, y)
339 self : object
340 """
--> 341 X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True)
342
343 if not callable(self.score_func):
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
720 if multi_output:
721 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
--> 722 dtype=None)
723 else:
724 y = column_or_1d(y, warn=True)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
540 if force_all_finite:
541 _assert_all_finite(array,
--> 542 allow_nan=force_all_finite == 'allow-nan')
543
544 if ensure_min_samples > 0:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan)
58 elif X.dtype == np.dtype('object') and not allow_nan:
59 if _object_dtype_isnan(X).any():
---> 60 raise ValueError("Input contains NaN")
61
62
ValueError: Input contains NaN

You can impute missing values using:
SimpleImputer, or
pandas fillna(). For example, the following imputes every missing value with the column mean (this assumes X is still a pandas DataFrame rather than the NumPy array sliced above):
X.fillna(X.mean())
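A minimal sketch of the SimpleImputer route, assuming X here is the numeric feature matrix from the question (column means are used as the fill value; adjust the strategy as needed):
import numpy as np
from sklearn.impute import SimpleImputer
# Replace every NaN with the mean of its column before running SelectKBest.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
print(np.isnan(X_imputed).any())  # should now print False
# X_imputed (and the original Y) can then be passed to test.fit().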

Related

I am trying to fit my data for Linear Regression and getting this error [duplicate]

> This is the error I am getting. I am new to Python; please help with this.
##########################################################################
ValueError Traceback (most recent call last)
Cell In[97], line 4
1 LR= LinearRegression()
3 #fit
----> 4 LR.fit(X,Y)
6 #predict
7 y_predict = LR.predict(X_test)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_base.py:649, in LinearRegression.fit(self, X, y, sample_weight)
645 n_jobs_ = self.n_jobs
647 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 649 X, y = self._validate_data(
650 X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
651 )
653 sample_weight = _check_sample_weight(
654 sample_weight, X, dtype=X.dtype, only_non_negative=True
655 )
657 X, y, X_offset, y_offset, X_scale = _preprocess_data(
658 X,
659 y,
(...)
662 sample_weight=sample_weight,
663 )
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:554, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
552 y = check_array(y, input_name="y", **check_y_params)
553 else:
--> 554 X, y = check_X_y(X, y, **check_params)
555 out = X, y
557 if not no_val_X and check_params.get("ensure_2d", True):
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:1104, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1099 estimator_name = _check_estimator_name(estimator)
1100 raise ValueError(
1101 f"{estimator_name} requires y to be passed, but the target y is None"
1102 )
-> 1104 X = check_array(
1105 X,
1106 accept_sparse=accept_sparse,
1107 accept_large_sparse=accept_large_sparse,
1108 dtype=dtype,
1109 order=order,
1110 copy=copy,
1111 force_all_finite=force_all_finite,
1112 ensure_2d=ensure_2d,
1113 allow_nd=allow_nd,
1114 ensure_min_samples=ensure_min_samples,
1115 ensure_min_features=ensure_min_features,
1116 estimator=estimator,
1117 input_name="X",
1118 )
1120 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1122 check_consistent_length(X, y)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:919, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
913 raise ValueError(
914 "Found array with dim %d. %s expected <= 2."
915 % (array.ndim, estimator_name)
916 )
918 if force_all_finite:
--> 919 _assert_all_finite(
920 array,
921 input_name=input_name,
922 estimator_name=estimator_name,
923 allow_nan=force_all_finite == "allow-nan",
924 )
926 if ensure_min_samples > 0:
927 n_samples = _num_samples(array)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:111, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
109 if X.dtype == np.dtype("object") and not allow_nan:
110 if _object_dtype_isnan(X).any():
--> 111 raise ValueError("Input contains NaN")
113 # We need only consider float arrays, hence can early return for all else.
114 if X.dtype.kind not in "fc":
ValueError: Input contains NaN
##########################################################################
This is the code I am running that produces the error above.
LR= LinearRegression(normalize=True)
#fit
LR.fit(X,Y)
#predict
y_predict = LR.predict(X_test)
It looks like the X and Y values you're passing to the fit function are not properly formatted, which is causing the issue. Specifically, it seems like one of the values in X might be null. I'm not familiar with the modules you are using, but checking that X and Y are set correctly is a good first step.
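A quick way to confirm that hypothesis, assuming X is a DataFrame and Y a Series (neither is shown being built in the question), is to count the missing values before calling fit:
import pandas as pd
# Any non-zero count here is enough to make fit() raise "Input contains NaN".
print(X.isna().sum())   # missing values per feature column
print(Y.isna().sum())   # missing values in the target
# Simplest fix if only a few rows are affected: drop them from X and Y together
# ('target' is just a placeholder column name).
data = pd.concat([X, Y.rename('target')], axis=1).dropna()
X_clean, Y_clean = data.drop(columns='target'), data['target']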
Use train_test_split. A simple LinearRegression example:
X = dataset[['statezip','city','bedrooms','sqft_living','sqft_lot','sqft_above','floors',]]
y = dataset['price']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
from sklearn import linear_model
import numpy as np
regr = linear_model.LinearRegression()
regr.fit(X_train,y_train)
y_pred_RMSE = regr.predict(X_test)
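Since the variable is named y_pred_RMSE, a hedged follow-up computing the actual RMSE on the hold-out set might look like this (mean_squared_error is not shown in the answer above):
import numpy as np
from sklearn.metrics import mean_squared_error
# Root mean squared error between the held-out targets and the predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_RMSE))
print(rmse)
Note that if X still contains NaN values, they should be dropped or imputed before train_test_split, otherwise fit() will raise the same error as in the question.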

Sklearn fitting SVM with StandardScaler

I am fitting an SVR on my dataset and am getting this error message. It was working before I included StandardScaler. I have tried everything, but it is still not working.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(np.array(y).reshape(1,-1))
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X,y)
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-75416c35e495> in <module>
2 from sklearn.svm import SVR
3 regressor = SVR(kernel = 'rbf') # rbf means radial basis function
----> 4 regressor.fit(X,y)
C:\anconda\lib\site-packages\sklearn\svm\_base.py in fit(self, X, y, sample_weight)
146 X, y = check_X_y(X, y, dtype=np.float64,
147 order='C', accept_sparse='csr',
--> 148 accept_large_sparse=False)
149 y = self._validate_targets(y)
150
C:\anconda\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
758 dtype=None)
759 else:
--> 760 y = column_or_1d(y, warn=True)
761 _assert_all_finite(y)
762 if y_numeric and y.dtype.kind == 'O':
C:\anconda\lib\site-packages\sklearn\utils\validation.py in column_or_1d(y, warn)
795 return np.ravel(y)
796
--> 797 raise ValueError("bad input shape {0}".format(shape))
798
799
ValueError: bad input shape (1, 10)
You are feeding the SVM a target vector with shape (1, 10), which means one row and ten columns. This is wrong, and it is caused by your use of reshape in
y = sc_y.fit_transform(np.array(y).reshape(1,-1))
Please note that this line is also conceptually wrong: standardization should be applied only to the training features, not to the target vector, so you can avoid defining
sc_y = StandardScaler()
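A hedged rewrite of the scaling step along those lines, assuming X and y are the same arrays as in the question: scale only the features and pass y as a 1-D array (if you really do want to scale the target as well, reshape it with (-1, 1), i.e. one column per sample, and flatten it back with ravel() before fitting):
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
sc_X = StandardScaler()
X_scaled = sc_X.fit_transform(X)      # standardize the features only
y = np.asarray(y).ravel()             # SVR expects y with shape (n_samples,)
regressor = SVR(kernel='rbf')
regressor.fit(X_scaled, y)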

Setting an array element with a sequence python - machine learning (logistic regression)

I am trying to run logistic regression on my y and x; however, I keep getting the error 'setting an array element with a sequence'. I think I might have to reshape my data, but I am not sure what array dimensions to use. I tried reshaping k to (3, 1) and g to (4000000, 1), but it still did not work. I have attached my code below (without the reshaping). The data is a netCDF file. I would appreciate it if anyone could help, thank you.
Screenshot of final_df.head(5)
import pandas as pd
import geopandas as gpd
from netCDF4 import Dataset
from osgeo import gdal, ogr
f = Dataset('C:\\filename.nc', 'r')
#Extract pixel 'coords'
B01_DATA = f.variables['B01_DATA'][:]
B02_DATA = f.variables['B02_DATA'][:]
VIS_DATA = f.variables['VIS_DATA'][:]
#these are look-up tables
B01_LUT = f.variables['B01_LUT'][:]
B02_LUT = f.variables['B02_LUT'][:]
VIS_LUT = f.variables['VIS_LUT'][:]
min_lat = -15
min_lon = 90
res = 0.009 #resolution
import numpy as np
lst = []
for x in range(0, 2000):
    for y in range(0,2000):
        B01 = (B01_LUT[B01_DATA[x,y]])
        B02 = (B02_LUT[B02_DATA[x,y]])
        VIS = (VIS_LUT[VIS_DATA[x,y]])
        k = np.array([B01,B02,VIS], dtype=np.float32)
        lst.append(k)
df = pd.DataFrame()
df['x'] = lst
#print(df)
lst1 = []
lst2=[]
for x in range(0, 2000):
    for y in range(0,2000):
        lon = min_lat + x*res
        lat = min_lon + y*res
        lst1.append(lat)
        lst2.append(lon)
df1 = pd.DataFrame()
df1['Latitude'] = lst1
df1['Longitude'] = lst2
df1['Coords'] = list(zip(df1.Latitude, df1.Longitude))
print(df1)
import shapefile
from shapely.geometry import shape, Point
# read your shapefile
r = shapefile.Reader("C:\\shapefile.shp")
# get the shapes
shapes = r.shapes()
# build a shapely polygon from your shape
hold = []
for k in range(20,22): #I am only taking a subset of layers in the polygon
    polygon = shape(shapes[k])
    for x in df1.Coords:
        if polygon.contains(Point(x)):
            hold.append(x)
#print(len(hold))
g = np.where(df1['Coords'].isin(hold), 1,0)
g.tolist()
df1['y'] = g
final_df = df.join(df1)
print(final_df)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X = final_df.X
y = final_df.y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
This is the full error message:
ValueError Traceback (most recent call last)
<ipython-input-12-f189af4819e6> in <module>()
2 from sklearn.linear_model import LogisticRegression
3 logmodel = LogisticRegression()
----> 4 logmodel.fit(X_train, y_train)
~\Anaconda2\envs\python3env\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1214
1215 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
-> 1216 order="C")
1217 check_classification_targets(y)
1218 self.classes_ = np.unique(y)
~\Anaconda2\envs\python3env\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
571 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
572 ensure_2d, allow_nd, ensure_min_samples,
--> 573 ensure_min_features, warn_on_dtype, estimator)
574 if multi_output:
575 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~\Anaconda2\envs\python3env\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: setting an array element with a sequence.
Looks to me like your error is caused by the fact that you have a column that contains lists, which isn't a valid input format for a model. Try something like this (taken from: Pandas split column of lists into multiple columns):
X = pd.DataFrame(final_df.X.values.tolist(), columns=['x1','x2','x3'])
This should return a three-column DataFrame with your coordinates.
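A short hedged end-to-end sketch of that fix. The question stores the list column as df['x'] (the answer writes final_df.X, so adjust the name to whatever your column is actually called), and the x1/x2/x3 column names are just placeholders:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Expand the column of 3-element lists into three numeric feature columns.
X = pd.DataFrame(final_df['x'].values.tolist(), columns=['x1', 'x2', 'x3'])
y = final_df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)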

Unpack Dictionaries for Logistic Regression in Python

I'm trying to run some sentiment analysis on product reviews, and I'm getting tripped up getting my model to read the word-count dictionaries.
import pandas as pd
import numpy as np
from sklearn import linear_model, model_selection, metrics
products = pd.read_csv('data.csv')
def count_words(s):
    d = {}
    wl = str(s).split()
    for w in wl:
        d[w] = wl.count(w)
    return d
products['word_count'] = products['review'].apply(count_words)
products = products[products['rating'] != 3]
products['sentiment'] = (products['rating'] >= 4) * 1
train_data, test_data = model_selection.train_test_split(products, test_size = 0.2, random_state=0)
sentiment_model = linear_model.LogisticRegression()
sentiment_model.fit(X = train_data['word_count'], y =train_data['sentiment'])
When I run that last line I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-51-0c3f47af3a6e> in <module>()
----> 1 sentiment_model.fit(X = train_data['word_count'], y =
train_data['sentiment'])
C:\ProgramData\anaconda_3\lib\site-packages\sklearn\linear_model\logistic.py
in fit(self, X, y, sample_weight)
1171
1172 X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
-> 1173 order="C")
1174 check_classification_targets(y)
1175 self.classes_ = np.unique(y)
C:\ProgramData\anaconda_3\lib\site-packages\sklearn\utils\validation.py in
check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
519 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
520 ensure_2d, allow_nd, ensure_min_samples,
--> 521 ensure_min_features, warn_on_dtype, estimator)
522 if multi_output:
523 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
C:\ProgramData\anaconda_3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
TypeError: float() argument must be a string or a number, not 'dict'
It seems like the model is pulling the dictionaries as the x variables instead of the entries in the dictionaries. I think I need to unpack the dictionaries into arrays (?) but haven't had any luck doing so.
Update:
Here is what products looks like after running word_count and defining sentiment:
products.head()
If you want to just correct the error, first use DictVectorizer on train_data['word_count'] to convert it into an acceptable format, which is an array of shape [n_samples, n_features].
Add the below to your code before sentiment_model.fit():
from sklearn.feature_extraction import DictVectorizer
dictVectorizer = DictVectorizer()
train_data_dict = dictVectorizer.fit_transform(train_data['word_count'])
Then call sentiment_model.fit() like this:
sentiment_model.fit(X = train_data_dict, y =train_data['sentiment'])
Note:
Instead of implementing your own word-count method, I would recommend using CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
countVec = CountVectorizer()
train_data_vectorizer = countVec.fit_transform(train_data['review'])
sentiment_model.fit(X = train_data_vectorizer, y =train_data['sentiment'])
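Either way, the vectorizer fitted on the training reviews should be reused with transform (not fit_transform) on the test split, so that both splits share the same feature columns. Continuing the CountVectorizer snippet above:
# Transform the held-out reviews with the already-fitted vectorizer, then predict.
test_data_vectorizer = countVec.transform(test_data['review'])
predictions = sentiment_model.predict(test_data_vectorizer)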
Try
X = train_data['word_count'].values()
This should return a list of word counts (numbers) for each item in train_data['word_count'] if that is what you are looking for.

Losing the header of a CSV file after normalizing

I've written the following code to read a CSV file and run a column-wise normalization:
from sklearn import preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# reading Train values
Training ='Training.csv'
df = pd.read_csv(Training)
df =df.drop(df.columns[len(df.loc[1])-1],axis=1)
df =df.drop(df.columns[len(df.loc[1])-1],axis=1)
df.describe()
minmax_scaler= preprocessing.MinMaxScaler()
np_scaled = minmax_scaler.fit_transform(df)
normalized = pd.DataFrame(np_scaled)
normalized.describe()
np.shape(df)
np.shape(normalized)
My question is: why can't I see the headers in the normalized DataFrame, despite it having the same shape as df? I've also tried to read the CSV file without a header, but then the program crashes:
..............................
df = pd.read_csv(Training,header=None)
.........................
delivers the following:
ValueError Traceback (most recent call last)
<ipython-input-15-dd18ba2a6204> in <module>()
14 df.describe()
15 minmax_scaler= preprocessing.MinMaxScaler()
---> 16 np_scaled = minmax_scaler.fit_transform(df)
17 normalized = pd.DataFrame(np_scaled)
18 normalized.describe()
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
492 if y is None:
493 # fit method of arity 1 (unsupervised transformation)
--> 494 return self.fit(X, **fit_params).transform(X)
495 else:
496 # fit method of arity 2 (supervised transformation)
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\preprocessing\data.py in fit(self, X, y)
290 # Reset internal state before fitting
291 self._reset()
--> 292 return self.partial_fit(X, y)
293
294 def partial_fit(self, X, y=None):
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\preprocessing\data.py in partial_fit(self, X, y)
316
317 X = check_array(X, copy=self.copy, ensure_2d=False, warn_on_dtype=True,
--> 318 estimator=self, dtype=FLOAT_DTYPES)
319
320 if X.ndim == 1:
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
ValueError: could not convert string to float: 'Feature458'
I'd be grateful for any hint about how I may solve this!
Well, that is because you use preprocessing.MinMaxScaler() which returns an array, not a dataframe.
After you create a dataframe based on this matrix, it does not know anything about your columns.
You could try something like
normalized = pd.DataFrame(np_scaled, columns=df.columns)
As for the second attempt (with header=None), your header simply becomes the first data row. When sklearn then tries to convert a column name such as 'Feature458' into a float, you get the error shown.
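A short hedged sketch of that fix, keeping the original index as well (which matters if any rows were dropped earlier):
import pandas as pd
from sklearn import preprocessing
minmax_scaler = preprocessing.MinMaxScaler()
np_scaled = minmax_scaler.fit_transform(df)
# Rebuild the DataFrame with the original column names and row index.
normalized = pd.DataFrame(np_scaled, columns=df.columns, index=df.index)
normalized.describe()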
