I'm dealing with a multicollinearity problem using the variance_inflation_factor() function, but after running it I found that it returned all the scores as infinite values.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')
mols = []
mols.extend(mols_A1)
mols.extend(mols_A2)
mols.extend(mols_B)
mols.extend(mols_C)
mols_df = pd.DataFrame(mols)
mols = pd.concat([mols_df, class_tot, data], axis=1)
mols = mols.dropna(axis=0, thresh=1400)
mols.groupby('target_name_quarter').mean()
fill_mean_func = lambda g: g.fillna(g.mean())
mols = mols.groupby('target_name_quarter').apply(fill_mean_func)
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
descriptors_scaled = pd.DataFrame(descriptors_scaled, columns=descriptors.columns, index = list(descriptors.index.values))
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)
mols = pd.concat([molfiles, descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']], axis=1)
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])
vif = pd.DataFrame()
des = mols.loc[:, 'nAcid':'Zagreb']
vif["VIF factor"] = [variance_inflation_factor(des.values, i) for i in range(des.shape[1])]
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() before eliminating low-variance features so that all the variables are on the same scale.
print(vif) returns a dataframe with all infinite values and I cannot figure out why.
Thank you in advance :)
An infinite VIF indicates a perfect linear relationship between one independent variable and the others. With perfect correlation we get R² = 1, so VIF = 1/(1 - R²) becomes infinite. To solve this, drop one of the variables causing the perfect multicollinearity from the dataset.
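One common way to do that (a minimal sketch, not tied to the asker's data) is to iteratively drop the feature with the largest VIF until every remaining value is finite and below a chosen threshold, e.g. 10:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
def drop_high_vif(df, threshold=10.0):
    # repeatedly remove the column with the largest VIF until all remaining
    # VIFs are finite and no larger than the threshold
    features = df.columns.tolist()
    while len(features) > 1:
        vifs = pd.Series(
            [variance_inflation_factor(df[features].values, i) for i in range(len(features))],
            index=features)
        worst = vifs.idxmax()
        if np.isfinite(vifs[worst]) and vifs[worst] <= threshold:
            break
        features.remove(worst)
    return df[features]
# hypothetical usage with the descriptor frame from the question:
# des_reduced = drop_high_vif(mols.loc[:, 'nAcid':'Zagreb'])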
Using the iris dataset as a hypothetical hello world example:
import pandas as pd
from sklearn import datasets
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0,1,2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "", regex=True)
print(df.head())
Let us say I want to use tf.keras.layers.Embedding instead of one-hot/dummy encoding as part of an ANN for regression, e.g.:
iris_class_name + sepalwidthcm + petallengthcm -> sepallengthcm
where sepallengthcm is the dependent variable. I came across this:
city_lookup = tf.keras.layers.StringLookup(vocabulary = city_vocabulary, mask_token = None)
city_embedding = tf.keras.Sequential([
    city_lookup,
    tf.keras.layers.Embedding(len(city_vocabulary) + 1, embedding_dimension)
], "city_embedding")
city = features["city"]
city_embedding_output = city_embedding(city)
but I am not sure how exactly to use it in my case. Any pointers are very much welcome. Thanks!
You can map iris_class_name to n-dimensional vector representations and then concatenate with the other continuous features:
import pandas as pd
from sklearn import datasets
import numpy as np
import tensorflow as tf
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0,1,2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "", regex=True)
vocab = df['iris_class_name'].unique()
embedding_dimension = 10
lookup = tf.keras.layers.StringLookup(vocabulary = vocab, mask_token = None)
embedding = tf.keras.Sequential([
    lookup,
    tf.keras.layers.Embedding(len(vocab) + 1, embedding_dimension)
])
names = df['iris_class_name'].to_numpy()
embedding_output = embedding(names)
features = np.concatenate((embedding_output, df[['sepalwidthcm', 'petallengthcm']].to_numpy()), axis=-1)
print(features.shape)
(150, 12)
Since you have 3 unique iris class names, you could also simply create an integer-to-vector dictionary manually, but it is up to you.
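If it helps, here is a minimal sketch (my own extension, not part of the code above) of how the embedding output and the numeric columns could feed a small regression model that predicts sepallengthcm:
numeric_input = tf.keras.Input(shape=(2,), name='numeric')           # sepalwidthcm, petallengthcm
name_input = tf.keras.Input(shape=(), dtype=tf.string, name='name')  # iris_class_name
x = tf.keras.layers.Concatenate()([embedding(name_input), numeric_input])
x = tf.keras.layers.Dense(16, activation='relu')(x)
output = tf.keras.layers.Dense(1)(x)                                 # predicted sepallengthcm
model = tf.keras.Model(inputs=[name_input, numeric_input], outputs=output)
model.compile(optimizer='adam', loss='mse')
model.fit([names, df[['sepalwidthcm', 'petallengthcm']].to_numpy()],
          df['sepallengthcm'].to_numpy(), epochs=5, verbose=0)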
I need to cluster data using Fuzzy C-Means, so I use fcm from pyclustering.cluster.fcm. I would like to know if there is a way to get the labels.
import numpy as np
import pandas as pd
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm
import random
coords = [(random.random()*2.0, random.random()*2.0) for _ in range(100)]
dfcluster = pd.DataFrame(coords, columns = ['x','y'])
sample = dfcluster.to_numpy()
# initialize
initial_centers = kmeans_plusplus_initializer(sample, 5, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
# create instance of Fuzzy C-Means algorithm
fcm_instance = fcm(sample, initial_centers)
# run cluster analysis and obtain results
fcm_instance.process()
clusters = fcm_instance.get_clusters()
print(clusters)
I have tried it this way, and it works, but I do not think it is a perfect answer:
import pandas as pd
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm
import random
coords = [(random.random()*2.0, random.random()*2.0) for _ in range(100)]
dfcluster = pd.DataFrame(coords, columns = ['x','y'])
sample = dfcluster.to_numpy()
# initialize
initial_centers = kmeans_plusplus_initializer(sample, 5, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
# create instance of Fuzzy C-Means algorithm
fcm_instance = fcm(sample, initial_centers)
# run cluster analysis and obtain results
fcm_instance.process()
clusters = fcm_instance.get_clusters()
cluster = 0
dfclusternew = pd.DataFrame(columns=['cluster', 'x', 'y'])
for index, i in enumerate(clusters):
    for j in i:
        dfclusternew = dfclusternew.append(
            pd.Series([cluster, dfcluster['x'].iloc[j], dfcluster['y'].iloc[j]], index=['cluster', 'x', 'y']),
            ignore_index=True)
    cluster += 1
dfcluster = dfclusternew
print(dfcluster)
However, I think there is another way to do it that is faster. Since get_clusters() returns a list of row indices for each cluster, I used dfcluster.loc[dfcluster.index[results[i]], 'cluster'] = i:
import pandas as pd
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm
import random
coords = [(random.random()*2.0, random.random()*2.0) for _ in range(100)]
dfcluster = pd.DataFrame(coords, columns = ['x','y'])
dfcluster['cluster'] = 0
sample = dfcluster.to_numpy()
# initialize
initial_centers = kmeans_plusplus_initializer(sample, 5, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
# create instance of Fuzzy C-Means algorithm
fcm_instance = fcm(sample, initial_centers)
# run cluster analysis and obtain results
fcm_instance.process()
dfcluster.reset_index()
results=fcm_instance.get_clusters()
for i in range(len(results)):
    dfcluster.loc[dfcluster.index[results[i]], 'cluster'] = i
print(dfcluster)
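Alternatively, a minimal sketch (my own variation on the same idea) that builds a flat label array with numpy from the index lists returned by get_clusters():
import numpy as np
labels = np.zeros(len(sample), dtype=int)
for cluster_id, member_indices in enumerate(results):
    labels[member_indices] = cluster_id   # every row in this cluster gets the same id
dfcluster['cluster'] = labels
print(dfcluster)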
Below is a sample of the data set that I am using:
id,product,store,revenue,store_capacity,state
1,Ball,AB,222,1000,CA
1,Pen,AB,234,1452,WD
2,Books,CD,543,888,MA
2,Ink,EF,123,9865,NY
The code is below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
sns.set(rc={'figure.figsize':(11.7,8.27)})
df = pd.read_csv(r'1.csv',index_col=None)
dummies = pd.get_dummies(data = df)
km = KMeans(n_clusters=3).fit(dummies)
labels = km.predict(dummies)
dummies['cluster_id'] = km.labels_
def distance_to_centroid(row, centroid):
    row = row[['id', 'product', 'store', 'revenue', 'store_capacity', 'state_AL', 'state_CA', 'state_CH',
               'state_WD', 'country_India', 'country_Japan', 'country_USA']]
    return euclidean(row, centroid)
dummies['distance_to_center0'] = dummies.apply(lambda r: distance_to_centroid(r, km.cluster_centers_[0]), axis=1)
dummies['distance_to_center1'] = dummies.apply(lambda r: distance_to_centroid(r, km.cluster_centers_[1]), axis=1)
dummies['distance_to_center2'] = dummies.apply(lambda r: distance_to_centroid(r, km.cluster_centers_[2]), axis=1)
dummies_df = dummies[['distance_to_center0','distance_to_center1','cluster_id']]
test = {0:"Blue", 1:"Red", 2:"Green"}
sns.scatterplot(x="distance_to_center0", y="distance_to_center1", data=dummies_df, hue="cluster_id", palette = test)
I need to get the centre of each cluster. The code below only gets the centroid assigned to each element, i.e. the distance from each element to the centre point of its cluster:
centroids = km.cluster_centers_
centroid_labels = [centroids[i] for i in labels]
centroid_labels
I want to get the centre point of each cluster
Courtesy of #Isma:
from sklearn.metrics import pairwise_distances_argmin_min
km = KMeans(n_clusters=7).fit(dummies)
closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, dummies)
closest
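Note that the centre points themselves are already available in km.cluster_centers_. A small sketch (assuming dummies still holds exactly the feature columns the model was fitted on) to view them with the feature names attached:
centers = pd.DataFrame(km.cluster_centers_, columns=dummies.columns)  # one row per cluster centre
centers.index.name = 'cluster_id'
print(centers)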
Below is Youtuber Sentdex's machine learning code, and I couldn't understand some parts.
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, model_selection
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_excel('titanic.xls')
original_df = pd.DataFrame.copy(df)
df.drop(['body', 'name'], 1, inplace=True)
df.fillna(0, inplace=True)
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))
    return df
df = handle_non_numerical_data(df)
df.drop(['ticket', 'home.dest'], 1, inplace=True)
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])
clf = MeanShift()
clf.fit(X)
labels= clf.labels_ ###Can't understand###
cluster_centers = clf.cluster_centers_
original_df['cluster_group'] = np.nan
for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]
n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    # print(temp_df.head())
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
    survival_rate = len(survival_cluster) / len(temp_df)
    # print(i, survival_rate)
    survival_rates[i] = survival_rate
print(survival_rates)
Supposedly in labels = clf.labels_, the labels are in the range [0, 5] (those are the numbers I got when I ran the program). But here is my question: where do those numbers come from, and why 0, 1, 2 and not bigger numbers?
scikit-learn's documentation on MeanShift explains the labels_ attribute that you seem confused about. Taken directly from the documentation:
labels_ :
Labels of each point.
If you're still unsure what these labels represent: each number identifies the cluster that a specific point was assigned to. All the points with a value of 0 belong to one cluster, all the points with a value of 1 belong to another, and so on. The actual values of the labels don't matter; they only identify which cluster each data point belongs to.
I'd recommend reading more about clustering if you're still confused about why you would want to label the data.
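A tiny illustration (my own, not from the documentation) of what the labels are:
import numpy as np
from sklearn.cluster import MeanShift
X = np.array([[1.0, 1.1], [1.2, 0.9], [8.0, 8.2], [7.9, 8.1]])
clf = MeanShift(bandwidth=2.0).fit(X)
print(clf.labels_)             # e.g. [0 0 1 1] -- one integer cluster id per row of X
print(np.unique(clf.labels_))  # the distinct cluster ids, 0 .. n_clusters-1
print(clf.cluster_centers_)    # one centre per distinct label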
We were given some code for a support vector machine, where we are supposed to implement leave-one-out cross validation. If I understand it correctly, leave-one-out creates as many test sets as there are samples, which means that for a big data set the process will be costly and will most likely take quite a long time to produce results.
I have tried to add leave-one-out to the given SVM code with only one iteration and 773 data points in total. I expected it to take some time, but two hours later the code was still running without any result, which makes me believe it might be stuck in a loop of some kind.
Does anyone have a suggestion as to what might be wrong? I'm not getting any error message either.
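To get a rough feel for the expected runtime, here is a small sketch (not part of the assignment code, using placeholder data of the same size as mine) that times a single fit and extrapolates to all leave-one-out folds:
import time
import numpy as np
from sklearn import svm
X = np.random.rand(773, 50)           # placeholder for the real feature matrix
y = np.random.randint(0, 2, 773)      # placeholder for the binary labels
start = time.time()
svm.LinearSVC(C=0.1, max_iter=100000).fit(X, y)
one_fit = time.time() - start
print(f"one fit: {one_fit:.2f} s, roughly {one_fit * len(X) / 60:.1f} min for {len(X)} LOO folds")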
The entire code is as follows, with the leave-one-out part in the last function at the bottom (executed in a Jupyter notebook on Binder):
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy as gp
from gseapy.plot import gseaplot
import qvalue
from ipywidgets import interact, interact_manual
from ipywidgets import IntSlider, FloatSlider, Dropdown, Text
import sklearn as skl
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut
from sklearn import svm
interact_enrich=interact_manual.options(manual_name="Enrichment analysis")
interact_plot=interact_manual.options(manual_name="Plot")
interact_calc=interact_manual.options(manual_name="Calculate tests")
interact_gen=interact_manual.options(manual_name="Initialize data")
interact_SVM=interact_manual.options(manual_name="Train SVM")
clinical_data = pd.read_csv('../data/brca_clin.tsv.gz', sep ='\t', index_col=2)
clinical_data = clinical_data.iloc[4:,1:]
expression_data = pd.read_csv('../data/brca.tsv.gz', sep ='\t', index_col=1)
expression_data = expression_data.iloc[:,2:].T
def split_data(clinical_df, expression_df, separator, cond1, cond2):
    try:
        group1 = clinical_df[separator] == cond1
        index1 = clinical_df[group1].index
        group2 = clinical_df[separator] == cond2
        index2 = clinical_df[group2].index
    except:
        print('Clinical condition wrong')
    expression1 = expression_df.loc[index1].dropna()
    expression2 = expression_df.loc[index2].dropna()
    expression = pd.concat([expression1, expression2])
    X = expression.values
    y = np.append(np.repeat(0, len(expression1)), np.repeat(1, len(expression2)))
    display(pd.DataFrame([len(index1), len(index2)], columns = ['Number of points'], index = ['Group 1', 'Group 2']))
    return X, y
def plot_pca_variance(X, scale=False, ncomp = 1):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA()
    pca.fit(X)
    plt.rcParams["figure.figsize"] = (20,10)
    sns.set(style='darkgrid', context='talk')
    plt.plot(np.arange(1, len(pca.explained_variance_ratio_)+1), np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.vlines(ncomp, 0, plt.gca().get_ylim()[1], color='r', linestyles = 'dashed')
    h = np.cumsum(pca.explained_variance_ratio_)[ncomp - 1]
    plt.hlines(h, 0, plt.gca().get_xlim()[1], color='r', linestyles = 'dashed')
    plt.title(str(ncomp) + ' components, ' + str(round(h, 3)) + ' variance explained')
    plt.show()
def reduce_data(X, n, scale=True):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA(n_components=n)
    Xr = pca.fit_transform(X)
    return Xr
def interact_split_data(Criteria, Group_1, Group_2):
    global BRCA_X, BRCA_y
    BRCA_X, BRCA_y = split_data(clinical_data, expression_data, Criteria, Group_1, Group_2)
def interact_SVM_1(Rescale, Max_iterations):
    max_iter = int(Max_iterations)
    loo = LeaveOneOut()
    ac_matrix_train, ac_matrix_test = np.array([]), np.array([])
    for train_id, test_id in loo.split(BRCA_X, BRCA_y):
        X_train, X_test, y_train, y_test = BRCA_X[train_id,:], BRCA_X[test_id,:], BRCA_y[train_id], BRCA_y[test_id]
        clf = svm.LinearSVC(C=0.1, max_iter=100000).fit(X_train, y_train)  # Train an SVM
        y_train_pred = clf.predict(X_train)
        ac_matrix_train = confusion_matrix(y_train, y_train_pred)
        y_test_pred = clf.predict(X_test)
        ac_matrix_test = confusion_matrix(y_test, y_test_pred)
    display(pd.DataFrame(np.concatenate((ac_matrix_train, ac_matrix_test), axis=1), columns = ["predicted G1 (training)", "predicted G2 (training)", "predicted G1 (test)", "predicted G2 (test)"], index=["actual G1", "actual G2"]))
interact_gen(interact_split_data, Criteria=Text('PR status by ihc'), Group_1 = Text('Positive'), Group_2=Text('Negative'))
interact_SVM(interact_SVM_1, Rescale = False, Max_iterations = Text('1'))