I'm building an ANN in Python and I'm having trouble encoding the last column, column[-1] (y), into binary values.
This column contains 6 different values, and I want to encode each one into its own column, as is done for the columns of X with OneHotEncoder.
Thanks
Ido
dataframe_screenshot
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import confusion_matrix,accuracy_score,mean_squared_error,r2_score
import tensorflow as tf
from tensorflow import keras
# Load the survey and drop the columns that are not used as model inputs.
Political_opinions = pd.read_csv("data.csv")
Political_opinions.drop(columns=['Timestamp','Yas','Bolge','Egitim'],axis=1,inplace=True)
print(Political_opinions)

# One-hot target matrix (one column per distinct value of `parti`),
# taken before any column is re-encoded.
one_hot_color = pd.get_dummies(Political_opinions.parti).values
print(Political_opinions.head(10))

# Binary inputs: 1 for "Erkek", 0 otherwise.
Political_opinions["Cinsiyet"] = (Political_opinions["Cinsiyet"]=="Erkek").astype(int)

# Binary inputs: 1 for "Hayır", 0 otherwise, for soru1 .. soru10.
for question in ["soru%d" % i for i in range(1, 11)]:
    Political_opinions[question] = (Political_opinions[question]=="Hayır").astype(int)

# Encode `parti` into one 0/1 column per party.
# The previous code assigned to the same `parti` column five times in a row;
# after the first assignment the column already held 0/1 integers, so every
# later comparison against a party name was False and the column became all
# zeros. Each party now gets its own `parti_<name>` column instead.
parti_dummies = pd.get_dummies(Political_opinions["parti"], prefix="parti").astype(int)
Political_opinions = pd.concat(
    [Political_opinions.drop(columns="parti"), parti_dummies], axis=1
)
Related
My input dataset is 9900 data with 4 inputs and 3 outputs
import numpy as np
import matplotlib.pyplot as plt
import gpflow as gpf
import tensorflow as tf
from gpflow.utilities import print_summary
from gpflow.ci_utils import ci_niter
import pandas as pd
MAXITER = ci_niter(1000)

# Linear model of coregionalization (LMC) kernel for GPR:
# three Matern-3/2 kernels, each with four lengthscales, mixed by a random
# 3x3 matrix W.
# Single-kernel alternative: kernel=gpf.kernels.Matern32(lengthscales=[1,1,1,1])
kern_list = []
for _ in range(3):
    kern_list.append(gpf.kernels.Matern32(lengthscales=[1, 1, 1, 1]))
mixing_matrix = np.random.randn(3, 3)
kernel = gpf.kernels.LinearCoregionalization(kern_list, W=mixing_matrix)

# NOTE(review): X and Y are not defined in the visible code — presumably the
# 4-input / 3-output data set described above; confirm.
m = gpf.models.GPR(data=(X, Y), kernel=kernel, mean_function=None)
When I try to optimise it
error image
Below is a sample of the data set that I am using:
id,product,store,revenue,store_capacity,state
1,Ball,AB,222,1000,CA
1,Pen,AB,234,1452,WD
2,Books,CD,543,888,MA
2,Ink,EF,123,9865,NY
Code is below
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Default figure size for all seaborn plots.
sns.set(rc={'figure.figsize': (11.7, 8.27)})

# Read the raw table and expand it with pd.get_dummies so it can be clustered.
df = pd.read_csv(r'1.csv', index_col=None)
dummies = pd.get_dummies(data=df)

# Fit a two-cluster KMeans model on the encoded table.
km = KMeans(n_clusters=2)
km.fit(dummies)
labels = km.predict(dummies)

# Store the fitted cluster label of every row next to its features.
dummies['cluster_id'] = km.labels_
def distance_to_centroid(row, centroid):
    """Return the Euclidean distance between a data row and a centroid.

    Only the first ``len(centroid)`` values of ``row`` are used. The previous
    version selected a hard-coded list of column names, which raised a
    ``KeyError`` when those columns were absent and did not have to match the
    centroid's length.

    Parameters
    ----------
    row : pandas.Series
        One row of the table; values beyond ``len(centroid)`` are ignored.
        NOTE(review): assumes the leading columns are the ones the centroid
        was computed on, in the same order — confirm at the call site.
    centroid : array-like
        Coordinates of the cluster centre.

    Returns
    -------
    float
        Euclidean distance between the row's leading values and ``centroid``.
    """
    centroid = np.asarray(centroid, dtype=float)
    # Cast explicitly: a row taken from a mixed-dtype frame may be object-typed.
    features = np.asarray(row.iloc[:centroid.shape[0]], dtype=float)
    return euclidean(features, centroid)
# Add one `distance_to_center<i>` column per fitted cluster centre.
# The previous code indexed km.cluster_centers_[2] unconditionally, which
# raises IndexError for a model fitted with two clusters; iterating over the
# centres that exist works for any number of clusters.
for center_idx, center in enumerate(km.cluster_centers_):
    dummies['distance_to_center%d' % center_idx] = dummies.apply(
        # Bind `center` as a default so each lambda keeps its own centre.
        lambda r, center=center: distance_to_centroid(r, center), 1)

# Plot every row by its distance to the first two centres, coloured by cluster.
dummies_df = dummies[['distance_to_center0','distance_to_center1','cluster_id']]
test = {0:"Blue", 1:"Red", 2:"Green"}
sns.scatterplot(x="distance_to_center0", y="distance_to_center1", data=dummies_df, hue="cluster_id", palette = test)
I need to get the centre of each cluster. The code below looks up, for each element, the centroid of the cluster it was assigned to (i.e. the centre point that the element's distance is measured from):
# Coordinates of every cluster centre, one row per cluster.
centroids = km.cluster_centers_
# For each sample, the centre of the cluster it was assigned to.
centroid_labels = [centroids[i] for i in labels]
# Display the result (was `centroid_label`, an undefined name -> NameError).
centroid_labels
I want to get the centre point of each cluster
courtesy #Isma
# pairwise_distances_argmin_min was used without being imported (NameError).
from sklearn.metrics import pairwise_distances_argmin_min

km = KMeans(n_clusters=7).fit(dummies)
# closest[i] is the position in `dummies` of the row nearest to centre i;
# the matching distances are discarded.
# NOTE(review): `dummies` also holds the `cluster_id` and `distance_to_center*`
# columns added earlier, so they are part of this fit — confirm that is wanted.
closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, dummies)
closest
I'm dealing with a multicollinearity problem using the variance_inflation_factor() function.
But after running the function, I found that it returned infinite values for all of the scores.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
# Descriptor table and class tables.
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')

# Molecule suppliers, one per SD file.
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')

# Collect all molecules into one list, in file order A1, A2, B, C.
mols = []
for supplier in (mols_A1, mols_A2, mols_B, mols_C):
    mols.extend(supplier)
mols_df = pd.DataFrame(mols)

# Put molecules, class info and descriptors side by side, then keep only the
# rows that have at least 1400 non-missing values.
mols = pd.concat([mols_df, class_tot, data], axis=1)
mols = mols.dropna(axis=0, thresh=1400)
mols.groupby('target_name_quarter').mean()


def fill_mean_func(g):
    """Fill the missing values of group `g` with that group's column means."""
    return g.fillna(g.mean())


mols = mols.groupby('target_name_quarter').apply(fill_mean_func)

# Split into the leading (non-descriptor) columns and the descriptor columns.
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']

# Min-max scale the descriptors, keeping the original column names and index.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
descriptors_scaled = pd.DataFrame(
    descriptors_scaled,
    columns=descriptors.columns,
    index=list(descriptors.index.values),
)
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    """Return the columns of `data` kept by a VarianceThreshold selector.

    A ``VarianceThreshold(threshold)`` is fitted on ``data`` and the columns
    it supports are returned, selected by column label.
    """
    fitted_selector = VarianceThreshold(threshold).fit(data)
    kept_positions = fitted_selector.get_support(indices=True)
    kept_labels = data.columns[kept_positions]
    return data[kept_labels]
# Filter the scaled descriptors with a threshold of 0.01.
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)

# Re-attach the surviving descriptors to the non-descriptor columns.
kept_descriptors = descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']
mols = pd.concat([molfiles, kept_descriptors], axis=1)

# Pairwise correlation of the remaining descriptors.
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
% matplotlib inline
# Visual check of a few descriptors against each other.
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])

# Variance inflation factor of every descriptor column, in column order.
des = mols.loc[:, 'nAcid':'Zagreb']
design_matrix = des.values
vif_scores = []
for column_position in range(des.shape[1]):
    vif_scores.append(variance_inflation_factor(design_matrix, column_position))

vif = pd.DataFrame()
vif["VIF factor"] = vif_scores
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() when eliminating low-variance features so that all the variables are in the same range.
print(vif) returns a dataframe in which every value is infinite, and I cannot figure out why.
Thank you in advance :)
This indicates perfect correlation between two independent variables. In the case of perfect correlation we get $R^2 = 1$, so the VIF, $1/(1 - R^2)$, becomes infinite. To solve this problem, we need to drop from the dataset one of the variables that is causing this perfect multicollinearity.
Hello, I'm using the univariate selection method to select the best features from the following data set:
https://i.stack.imgur.com/J31T0.png
But I got an error:
ValueError: could not convert string to float: 'SUDMyYggegA'
Below is my code:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
data = pd.read_csv("C://Users/Shahnawaz Irfan/Desktop/demo.csv")

# Candidate features: the first 15 columns; target: the 13th column from the end.
# NOTE(review): depending on the file's width the target column may also lie
# inside X — confirm the intended target.
X = data.iloc[:,0:15]
y = data.iloc[:,-13]

# chi2 only accepts numeric input; text columns (e.g. the 'SUDMyYggegA' value
# from the reported error) raise "could not convert string to float", so only
# the numeric columns are scored.
X = X.select_dtypes(include="number")

# Never ask for more features than remain after the numeric filter.
bestfeatures = SelectKBest(score_func=chi2, k=min(10, X.shape[1]))
fit = bestfeatures.fit(X,y)

# Pair every feature name with its chi-squared score and show the best ones.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['features','Score']
# A stray trailing backtick on this line was a syntax error; removed.
print(featureScores.nlargest(15,'Score'))
I want to calculate the variance of the features saved in a Train and a Test file, as follows:
col1 Feature0 Feature1 Feature2 Feature3 Feature4 Feature5 Feature6 Feature7 Feature8 Feature9
col2 26658 40253.5 3.22115e+09 0.0277727 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
col3 2658 4053.5 3.25e+09 0.0277 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
....
For that I've written the following:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
# VarianceThreshold was used below without being imported (NameError).
from sklearn.feature_selection import VarianceThreshold

# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape

# Now that the feature values are loaded, rescale them with a MinMaxScaler
# (every column except the last two).
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])

# Fit the selector and keep its result instead of discarding it.
sel = VarianceThreshold(threshold=(.2 * (1 - .2)))
selected_train = sel.fit_transform(np_scaled_train)

# get_support() is True for the columns the selector keeps; the remaining
# positions are the low-variance features the question asks for.
low_variance_idx = np.flatnonzero(~sel.get_support())
print(low_variance_idx)

# The scaled training matrix is still written out unchanged.
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
This obviously doesn't work: the result in variance_result.csv is just the normalized training matrix.
So my question is: how can I get the indices of the columns (features) that have a variance below 20%?
thanks in advance !
Update
I've solved the variance issue this way :
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Input files
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape

# Rescale every feature column (all but the last two) with a MinMaxScaler
# and wrap the result in a DataFrame.
stdsc = preprocessing.MinMaxScaler()
feature_columns = Training_Frame.iloc[:, :-2]
np_scaled_train = stdsc.fit_transform(feature_columns)
pd_scaled_train = pd.DataFrame(data=np_scaled_train)

# Per-column variance of the scaled features.
variance = pd_scaled_train.apply(np.var, axis=0)

# Write the scaled matrix, then the variances labelled with the original
# feature names (one column per feature after the transpose).
pd_scaled_train.to_csv('variance_result.csv', header=False, index=False)
feature_names = Training_Frame.columns.values[:-2]
temp_df = pd.DataFrame(data=variance.values, index=feature_names)
temp_df.T.to_csv('Training_features_variance.csv', index=False)
Now I still don't know how to get the indices of the features whose variance is, say, bigger than 0.2, other than by running a loop!
Just set the threshold to 0.0 and then use the variances_ attribute of the VarianceThreshold object to get the variances of all your features, then you can identify which of them have lower variance.
from sklearn.feature_selection import VarianceThreshold
# Three samples with four features each; the first and last feature are constant.
X = [
    [0, 2, 0, 3],
    [0, 1, 4, 3],
    [0, 1, 1, 3],
]

# Fit with the default threshold, then read the per-feature variances from
# the fitted selector.
selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_
#Output: array([ 0. , 0.22222222, 2.88888889, 0. ])