ValueError: could not convert string to float in univariate_selection - python

Hello, I'm using the univariate selection method to select the best features from the following data set:
https://i.stack.imgur.com/J31T0.png
But I got an error:
ValueError: could not convert string to float: 'SUDMyYggegA'
Below is my code:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
data = pd.read_csv("C://Users/Shahnawaz Irfan/Desktop/demo.csv")
X = data.iloc[:,0:15]
y = data.iloc[:,-13]
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['features','Score']
print(featureScores.nlargest(15,'Score'))
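No answer is recorded here, but a likely cause (an inference, not stated in the post): chi2 only accepts numeric, non-negative feature values, and the dataframe clearly contains string columns, since the error names the value 'SUDMyYggegA'. A minimal sketch of one way to encode the object-dtype columns before fitting, assuming they are categorical:

# Sketch only: assumes `data` is the dataframe from the question and that
# its string columns are categorical.
X = data.iloc[:, 0:15].copy()
for col in X.select_dtypes(include="object").columns:
    # cat.codes maps each category to an integer >= 0 (NaN becomes -1 and
    # would need filling first, since chi2 rejects negative values)
    X[col] = X[col].astype("category").cat.codes
fit = SelectKBest(score_func=chi2, k=10).fit(X, y)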


Encoding target column

I'm constructing an ANN in Python and I'm having trouble encoding column[-1] (y) into binary numbers.
There are 6 different categories in this column, and I want to encode each one into a separate column, as was done for the columns of X with OneHotEncoder.
Thanks,
Ido
[dataframe screenshot]
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import confusion_matrix,accuracy_score,mean_squared_error,r2_score
import tensorflow as tf
from tensorflow import keras
Political_opinions = pd.read_csv("data.csv")
Political_opinions.drop(columns=['Timestamp','Yas','Bolge','Egitim'],axis=1,inplace=True)
print(Political_opinions)
one_hot_color = pd.get_dummies(Political_opinions.parti).values
print(Political_opinions.head(10))
Political_opinions["Cinsiyet"] = (Political_opinions["Cinsiyet"]=="Erkek").astype(int)
Political_opinions["soru1"] = (Political_opinions["soru1"]=="Hayır").astype(int)
Political_opinions["soru2"] = (Political_opinions["soru2"]=="Hayır").astype(int)
Political_opinions["soru3"] = (Political_opinions["soru3"]=="Hayır").astype(int)
Political_opinions["soru4"] = (Political_opinions["soru4"]=="Hayır").astype(int)
Political_opinions["soru5"] = (Political_opinions["soru5"]=="Hayır").astype(int)
Political_opinions["soru6"] = (Political_opinions["soru6"]=="Hayır").astype(int)
Political_opinions["soru7"] = (Political_opinions["soru7"]=="Hayır").astype(int)
Political_opinions["soru8"] = (Political_opinions["soru8"]=="Hayır").astype(int)
Political_opinions["soru9"] = (Political_opinions["soru9"]=="Hayır").astype(int)
Political_opinions["soru10"] = (Political_opinions["soru10"]=="Hayır").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="AKP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="MHP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="CHP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="DIĞER").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="HDP").astype(int)

creating a dataset function using scikit-learn

I am pretty new to Python, and I am trying to load a dataset from my computer using scikit-learn. This is what my code looks like:
whatever.py:
import numpy as np
import csv
from sklearn.datasets.base import Bunch

class Cortex_nuc:
    def cortex_nuclear():
        with open('C:/Users/User/Desktop/Data_Cortex_Nuclear4.csv') as csv_file:
            data_file = csv.reader(csv_file)
            temp = next(data_file)
            n_samples = int(float(temp[0]))
            n_features = int(float(temp[1]))
            data = np.empty((n_samples, n_features))
            target = np.empty((n_samples,), dtype=np.float64)

            for i, sample in enumerate(data_file):
                data[i] = np.asarray(sample[:-1], dtype=np.float64)
                target[i] = np.asarray(sample[-1], dtype=np.float64)

        return Bunch(data=data, target=target)
So then I import it into my project:
from whatever import Cortex_nuc
and after that I try to save it into df:
df = Cortex_nuc.cortex_nuclear()
By the way, this is what the dataset looks like (screenshot omitted): this is just a part of the dataset; in total it has 77 columns and about a thousand rows.
But I get an error message and I can't seem to figure out why it's happening. Here's the error message:
IndexError Traceback (most recent call last)
<ipython-input-5-a4935f2c187f> in <module>
----> 1 df = Cortex_nuc.cortex_nuclear()
~\whatever.py in cortex_nuclear()
20
21 for i, sample in enumerate(data_file):
---> 22 data[i] = np.asarray(sample[:-1], dtype=np.float64)
23 target[i] = np.asarray(sample[-1], dtype=np.float64)
24
IndexError: index 0 is out of bounds for axis 0 with size 0
Can someone please help me? Thanks!
The IndexError says axis 0 has size 0, i.e. data was allocated with n_samples = 0, so the first line of your CSV is evidently not the "n_samples, n_features" count line this loading pattern expects. If you want to create a "sklearn-like" dataset in a Bunch object, you probably want something like this:
import pandas as pd
import numpy as np
from sklearn.utils import Bunch

# For reproducing
from io import StringIO
csv_file = StringIO("""
target,A,B
0,0,0
1,0,1
1,1,0
0,1,1
""")

def load_xor(*, return_X_y=False):
    """Describe your data here."""
    _data_file = pd.read_csv(csv_file)
    _data = Bunch()
    _data["DESCR"] = load_xor.__doc__
    _data["data"] = _data_file[["A", "B"]].to_numpy(dtype=np.float64)
    _data["target"] = _data_file["target"].to_numpy(dtype=np.float64)
    _data["target_names"] = np.array(["false", "true"])
    _data["feature_names"] = np.array(list(_data_file.drop(["target"], axis=1)))
    if return_X_y:
        return _data.data, _data.target
    return _data

if __name__ == "__main__":
    # Return and unpack the `X`, `y` tuple
    X, y = load_xor(return_X_y=True)
    print(X, y)
This is because the loaders in sklearn.datasets typically return Bunch objects with specific attributes/keys (for explanations, see the "Returns" section of the load_iris documentation):
>>> from sklearn.datasets import load_iris
>>> data = load_iris()
>>> dir(data)
['DESCR', 'data', 'feature_names', 'filename', 'frame', 'target', 'target_names']

VIF function returns all 'inf' values

I'm tackling a multicollinearity problem with the variance_inflation_factor() function.
But after running the function, I found that it returned all the scores as infinite values.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')
mols = []
mols.extend(mols_A1)
mols.extend(mols_A2)
mols.extend(mols_B)
mols.extend(mols_C)
mols_df = pd.DataFrame(mols)
mols = pd.concat([mols_df, class_tot, data], axis=1)
mols = mols.dropna(axis=0, thresh=1400)
mols.groupby('target_name_quarter').mean()
fill_mean_func = lambda g: g.fillna(g.mean())
mols = mols.groupby('target_name_quarter').apply(fill_mean_func)
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
descriptors_scaled = pd.DataFrame(descriptors_scaled, columns=descriptors.columns, index = list(descriptors.index.values))
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)
mols = pd.concat([molfiles, descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']], axis=1)
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])
vif = pd.DataFrame()
des = mols.loc[:, 'nAcid':'Zagreb']
vif["VIF factor"] = [variance_inflation_factor(des.values, i) for i in range(des.shape[1])]
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() when eliminating the low-variance features so as to put all the variables on the same scale.
print(vif) returns a dataframe of all infinite values, and I cannot figure out why.
Thank you in advance :)
This shows a perfect correlation between two of the independent variables. In the case of perfect correlation we get R^2 = 1, which makes VIF = 1/(1 - R^2) infinite. To solve this problem we need to drop from the dataset one of the variables causing the perfect multicollinearity.
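A minimal sketch of that advice (an illustration, not from the original answer), assuming des is the descriptor frame from the question: repeatedly drop the column with the largest VIF until every remaining VIF is finite and below a chosen cutoff.

import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def drop_high_vif(df, cutoff=10.0):
    # Iteratively remove the worst column; infinite VIFs always exceed the cutoff.
    df = df.copy()
    while df.shape[1] > 1:
        vifs = pd.Series(
            [variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
            index=df.columns,
        )
        if np.isfinite(vifs.max()) and vifs.max() <= cutoff:
            break
        df = df.drop(columns=[vifs.idxmax()])
    return df

des_reduced = drop_high_vif(des)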

How can I fix the 'syntax error' in Python using Jupyter Notebook?

Hi, I tried to fix the error but I could not, and I don't know where I'm going wrong. Can anyone please help? Below is my code.
My previous error was an indentation error.
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn as s
import matplotlib
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from IPython.display import display
df = pd.read_csv("C:/Users/patel/Desktop/tap.csv")
from IPython.display import display
X_all = df.drop(['FTR'],1)
y_all = df['FTR']
# Standardising the data.
from sklearn.preprocessing import scale
#Center to the mean and component wise scale to unit variance.
cols = [['FTHG','FTAG','HTHG','HTAG']]
for col in cols:
    X_all[col] = scale(X_all[col])
X_all.HM1 = X_all.HM1.astype('str')
X_all.HM2 = X_all.HM2.astype('str')
X_all.HM3 = X_all.HM3.astype('str')
X_all.AM1 = X_all.AM1.astype('str')
X_all.AM2 = X_all.AM2.astype('str')
X_all.AM3 = X_all.AM3.astype('str')
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_df in X.iteritems():
        if col_df.dtype == object:
            col_df = pd.get_dummies(col_df, prefix = col)
        output = output.join(col_df)
    return output
X_all = preprocess_features(X_all)
print "Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns))
print "\nFeature values:"
display (X_all)
File "", line 39
print "Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns))
^
SyntaxError: invalid syntax
If you are using Python 3, then the parentheses in print function are missing. The following code should work.
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

Getting the variance of each column in pandas

I want to calculate the variance of features saved in a Train and a Test file, laid out as follows:

col1  Feature0  Feature1  Feature2     Feature3   Feature4  Feature5  Feature6  Feature7  Feature8     Feature9
col2  26658     40253.5   3.22115e+09  0.0277727  5.95939   266.56    734.248   307.364   0.000566779  0.000520574
col3  2658      4053.5    3.25e+09     0.0277     5.95939   266.56    734.248   307.364   0.000566779  0.000520574
....

For that I've written the following:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now we have the feature values saved we start
# with the standardisation of the those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
sel = VarianceThreshold(threshold=(.2 * (1 - .2)))
sel.fit_transform(np_scaled_train)
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
This obviously doesn't work: the result in variance_result.csv is just the normalized training matrix. So my question is how I can get the indices of the columns (features) that have a variance below 20%.
Thanks in advance!
Update
I've solved the variance computation this way:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now we have the feature values saved we start
# with the standardisation of the those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
variance =pd_scaled_train.apply(np.var,axis=0)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
temp_df = pd.DataFrame(variance.values,Training_Frame.columns.values[:-2])
temp_df.T.to_csv('Training_features_variance.csv',index=False)
Now I still don't know how to get the indices of the features with a variance bigger than, say, 0.2 from variance, other than by running a loop!
Just set the threshold to 0.0 and then use the variances_ attribute of the VarianceThreshold object to get the variances of all your features, then you can identify which of them have lower variance.
from sklearn.feature_selection import VarianceThreshold
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_
#Output: array([ 0. , 0.22222222, 2.88888889, 0. ])
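Building on that (a sketch, assuming the Training_Frame and fitted selector objects from above): a boolean mask over variances_ yields the indices and the column names without an explicit loop.

import numpy as np

# Indices of the features whose variance is below 0.2
low_var_idx = np.where(selector.variances_ < 0.2)[0]
# Map back to the original feature names; assumes the column order matches
# Training_Frame minus its last two columns, as in the question
low_var_names = Training_Frame.columns[:-2][low_var_idx]
print(low_var_idx, list(low_var_names))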
