Understanding problems in scikit learn in Python - python

Below is Youtuber Sentdex's machine learning code, and I couldn't understand some parts.
import numpy as np
from sklearn.cluster import MeanShift, KMeans
from sklearn import preprocessing, model_selection
import pandas as pd
import matplotlib.pyplot as plt
# Load the Titanic passenger table and keep an untouched copy, so the
# human-readable columns are still available when inspecting clusters later.
df = pd.read_excel('titanic.xls')
original_df = df.copy()
# `axis` must be given by keyword: pandas 2.x no longer accepts it positionally.
df.drop(['body', 'name'], axis=1, inplace=True)
# Replace every missing value with 0 so the data can be cast to float later.
df.fillna(0, inplace=True)
def handle_non_numerical_data(df):
    """Replace every non-numeric column of *df* with integer codes, in place.

    A column is left untouched when its dtype is ``int64`` or ``float64``.
    In any other column each distinct value gets an integer id, numbered from
    0 in order of first appearance, so the encoding is the same on every run
    (iterating a ``set`` of strings is not).

    Returns the same (mutated) DataFrame for convenience.
    """
    for column in df.columns.values:
        dtype = df[column].dtype
        if dtype == np.int64 or dtype == np.float64:
            continue

        column_contents = df[column].values.tolist()
        # dict.fromkeys de-duplicates while keeping first-seen order.
        text_digit_vals = {
            value: code
            for code, value in enumerate(dict.fromkeys(column_contents))
        }
        df[column] = [text_digit_vals[value] for value in column_contents]
    return df
# Encode text columns as integers, then drop columns not used as features.
df = handle_non_numerical_data(df)
df.drop(['ticket', 'home.dest'], axis=1, inplace=True)

# Feature matrix (everything except the target), standardized column-wise.
X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)
labels = clf.labels_ ###Can't understand###
cluster_centers = clf.cluster_centers_

# Row i of X came from row i of original_df, so the labels can be attached as
# one column. This replaces the per-row chained `.iloc[i] = ...` assignment.
# Stored as float to match the float(i) comparison below.
original_df['cluster_group'] = labels.astype(float)

n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[original_df['cluster_group'] == float(i)]
    survival_cluster = temp_df[temp_df['survived'] == 1]
    # Fraction of this cluster's passengers that survived.
    survival_rates[i] = len(survival_cluster) / len(temp_df)
print(survival_rates)
When I ran the program, the values in "labels = clf.labels_" were in the range [0 : 5]. But here is my question: where do those numbers come from? And why are they 0, 1, 2, ... rather than some bigger numbers?

scikit-learn's documentation on MeanShift provides an explanation of the labels_ attribute that you seem confused about. Taken directly from the documentation:
labels_ :
Labels of each point.
If you're more confused about what these labels represent, a brief explanation would be that each number identifies the cluster that a specific point was assigned to. So all the points with a label of 0 belong to the same cluster, all the points with a label of 1 belong to another cluster, and so on. The actual values of these labels don't really matter; they are only there to identify which cluster each data point belongs to.
I'd recommend reading more about clustering if you're still confused about why you would want to label the data.

Related

Having some problem to understand the x_bin in regplot of Seaborn

I used seaborn.regplot to plot data, but I do not quite understand how the error bars in regplot are calculated. I have compared the results with the mean and standard deviation derived from a manual calculation. Here is my testing script.
import numpy as np
import pandas as pd
import seaborn as sn
def get_data_XYE(p):
    """Extract centre and half-height from every line in ``p.lines``.

    For each line, the first x value is taken as its position and the first
    two y values as its lower and upper ends. Presumably each line is one
    error bar drawn by regplot; confirm against the plot being inspected.

    Returns a tuple ``(x, y, y_error)`` of NumPy arrays, where ``y`` is the
    midpoint of the two ends and ``y_error`` is ``upper - y``.
    """
    x_list = []
    lower_list = []
    upper_list = []
    for line in p.lines:
        ydata = line.get_ydata()
        x_list.append(line.get_xdata()[0])
        lower_list.append(ydata[0])
        upper_list.append(ydata[1])

    lower = np.asarray(lower_list)
    upper = np.asarray(upper_list)
    y = 0.5 * (lower + upper)
    y_error = upper - y
    x = np.asarray(x_list)
    return x, y, y_error
x = [37.3448,36.6026,42.7795,34.7072,75.4027,226.2615,192.7984,140.8045,242.9952,458.451,640.6542,726.1024,231.7347,107.5605,200.2254,190.0006,314.1349,146.8131,152.4497,175.9096,284.9926,116.9681,118.2953,312.3787,815.8389,458.0146,409.5797,595.5373,188.9955,15.7716,36.1839,244.8689,57.4579,94.8717,112.2237,87.0687,72.79,22.3457,24.1728,29.505,80.8765,252.7454,280.6002,252.9573,348.246,112.705,98.7545,317.0541,300.9573,402.8411,406.6884,56.1286,30.1385,32.9909,497.556,19.3606,20.8409,95.2324,108.6074,15.7753,54.5511,45.5623,64.564,101.1934,81.8459,88.286,58.2642,56.1225,51.2943,38.0649,63.5882,63.6847,120.495,102.4097,49.3255,111.3309,171.6028,58.9526,28.7698,144.6884,180.0661,116.6028,146.2594,199.8702,128.9378,423.2363,119.8537,124.6508,518.8625,306.3023,79.5213,121.0309,116.9346,170.8863,930.361,48.9983,55.039,47.1092,72.0548,75.4045,103.521,83.4134,142.3253,146.6215,121.4467,101.4252,68.4812,291.4275,143.9475,142.647,78.9826,47.094,204.2196,89.0208,82.792,27.1346,142.4764,83.7874,67.3216,112.9531,138.2549,133.3446,86.2659,45.3464,56.1604,43.5882,54.3623,86.296,115.7272,96.5498,111.8081,36.1756,40.2947,34.2532,89.1452,53.9062,36.458,113.9297,176.9962,77.3125,77.8891,64.807,64.1515,127.7242,119.6876,976.2324,322.8454,434.2883,168.6923,250.0284,234.7329,131.0793,152.335,118.8838,243.1772,24.1776,168.6327,170.7541,167.8444,75.9315,110.1045,113.4417,60.5464,66.8956,79.7606,71.6659,72.5251,77.513,207.8019,21.8592,35.2787,169.7698,146.5012,412.9934,248.0708,318.5489,104.1278,184.7592,108.0581,175.2646,169.7698,340.3732,570.3396,23.9853,69.0405,66.7391,67.9435,294.6085,68.0537,77.6344,433.2713,104.3178,229.4615,187.8587,78.1399,121.4737,122.5451,384.5935,38.5232,117.6835,50.3308,318.2513,103.6695,20.7181,321.9601,510.3248,13.4754,16.1188,44.8082,37.7291,733.4587,446.6241,21.1822,287.9603,327.2367,274.1109,195.4713,158.2114,64.4537,26.9857,172.8503]
y = [37,40,30,29,24,23,27,12,21,20,29,28,27,32,23,29,28,22,28,23,24,29,32,18,22,12,12,14,29,31,34,31,22,40,25,36,27,27,29,35,33,25,25,27,27,19,35,26,18,24,25,37,52,47,34,39,40,48,41,44,35,36,53,46,38,44,23,26,26,28,27,21,25,21,20,27,35,24,46,34,22,30,30,30,31,26,25,28,21,31,24,27,33,21,31,33,29,33,32,21,25,22,39,31,34,26,23,18,20,18,34,25,20,12,23,25,21,21,25,31,17,27,28,29,25,24,25,21,24,27,23,22,23,22,22,26,22,19,26,35,33,35,29,26,26,30,22,32,33,33,28,32,26,29,36,37,37,28,24,30,25,20,29,24,33,35,30,32,31,33,40,35,37,24,34,29,27,24,36,26,26,26,27,27,20,17,28,34,18,20,20,18,19,23,20,22,25,32,44,41,39,41,40,44,36,42,31,32,26,29,23,29,29,28,31,22,29,24,28,28,25]
xbreaks = [13.4754, 27.1346, 43.5882, 58.9526, 72.79, 89.1452, 110.1045, 131.0793, 158.2114, 180.0661, 207.8019, 234.7329, 252.9573, 300.9573, 327.2367, 348.246, 412.9934, 434.2883, 458.451, 518.8625, 595.5373, 640.6542, 733.4587, 815.8389, 930.361, 976.2324]
df = pd.DataFrame([x, y]).T
df.columns = ['x', 'y']

# Check the bin average and std using agg: here `xbreaks` are the bin EDGES.
bins = pd.cut(df.x, xbreaks, right=False)
t = df[['x', 'y']].groupby(bins).agg({"x": "mean", "y": ["mean", "std"]})
t.reset_index(inplace=True)
t.columns = ['range_cut', 'x_avg_cut', 'y_avg_cut', 'y_std_cut']
t.index.name = 'id'

# Get the bin average from regplot. seaborn is imported as `sn` in this
# script, so the call must use that alias.
# NOTE(review): `seed` is not defined anywhere in this snippet; assign it
# (e.g. an integer) before running.
g = sn.regplot(x='x', y='y', data=df, fit_reg=False, x_bins=xbreaks, seed=seed)
xye = pd.DataFrame(get_data_XYE(g)).T
xye.columns = ['x_regplot', 'y_regplot', 'e_regplot']
xye.index.name = 'id'

# Line the two tables up side by side on the shared 'id' index.
t2 = xye.merge(t, on='id', how='left')
t2
You can see that the y and e values obtained in the two ways are different. I understand that the default x_ci or x_estimator may affect the result of regplot, but I still cannot reproduce these values in Excel, even by removing some of the lowest and/or highest values in each bin.
In seaborn.regplot, the x_bins are the center of each bin, and the original x values are assigned to the nearest bin value. Whereas in pandas.cut, the breaks define the bin edges.

VIF function returns all 'inf' values

I'm dealing with a multicollinearity problem using the variance_inflation_factor() function.
But after running the function, I found that the function returned all the scores as infinite values.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
# Load the descriptor table and the two class tables from CSV.
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')
# Read the four SDF files and collect all their entries in one list.
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')
mols = []
mols.extend(mols_A1)
mols.extend(mols_A2)
mols.extend(mols_B)
mols.extend(mols_C)
mols_df = pd.DataFrame(mols)
# Put molecules, class info and descriptors side by side (column-wise concat).
# NOTE(review): this assumes all three tables are in the same row order - confirm.
mols = pd.concat([mols_df, class_tot, data], axis=1)
# Keep only rows that have at least 1400 non-missing values.
mols = mols.dropna(axis=0, thresh=1400)
# Per-group means; the result is not stored (notebook-style display only).
mols.groupby('target_name_quarter').mean()
# Fill the remaining gaps with the mean of the row's own group.
fill_mean_func = lambda g: g.fillna(g.mean())
mols = mols.groupby('target_name_quarter').apply(fill_mean_func)
# Split by column label: everything up to 'target_quarter', and the
# 'nAcid'..'Zagreb' range that is used as descriptors below.
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']
from sklearn.preprocessing import MinMaxScaler
# Rescale the descriptor columns with MinMaxScaler (fit, then transform).
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
# transform() returns a plain array, so restore the column names and row index.
descriptors_scaled = pd.DataFrame(descriptors_scaled, columns=descriptors.columns, index = list(descriptors.index.values))
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    """Return *data* restricted to the columns VarianceThreshold keeps.

    A ``VarianceThreshold(threshold)`` selector is fitted on *data* (a
    DataFrame) and the surviving columns are returned with their original
    labels, rather than the bare array the selector's own transform yields.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    kept_columns = data.columns[selector.get_support(indices=True)]
    return data[kept_columns]
# Select descriptor columns using a variance threshold of 0.01 on the scaled data.
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)
# Re-attach the remaining descriptors to the non-descriptor columns.
mols = pd.concat([molfiles, descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']], axis=1)
# Pairwise correlations of the descriptors; result not stored (display only).
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
% matplotlib inline
# Scatter-plot matrix of five of the descriptors.
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])
vif = pd.DataFrame()
des = mols.loc[:, 'nAcid':'Zagreb']
# One VIF per descriptor column. VIF_i = 1 / (1 - R_i^2), where R_i^2 comes
# from regressing column i on all the other columns, so a column that is an
# exact linear combination of the others yields inf.
vif["VIF factor"] = [variance_inflation_factor(des.values, i) for i in range(des.shape[1])]
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() when eliminate low-variance features so as to make all the variables in same range.
print(vif) returns a dataframe with all infinite values and I cannot figure out why.
Thank you in advance :)
This shows a perfect correlation between two independent variables. In the case of perfect correlation, regressing one variable on the others gives $R^2 = 1$, so the VIF, $1 / (1 - R^2)$, becomes infinite. To solve this problem, we need to drop from the dataset one of the variables that is causing this perfect multicollinearity.

my algorithm gives bad clusters while usingTF-IDF

I'm getting bad clusters. I would like to rewrite this in a way where I can just plug in any algorithm I like (e.g. hierarchical, KNN, k-means, etc.).
#takes in our text_extracts dictionary and returns clusters in an indexed list
def run_clustering(plan, make_clusterer=None):
    """Transform texts to Tf-Idf coordinates and cluster them.

    Pages of ``plan.page_list`` whose ``text_extract`` is longer than 50
    characters are vectorized with Tf-Idf. Several cluster counts are tried
    and the one with the highest silhouette coefficient is used for the final
    clustering.

    Args:
        plan: object exposing ``page_list``; each page needs ``text_extract``,
            ``document_id`` and ``page_number``.
        make_clusterer: optional callable ``size -> estimator`` used to build
            the clustering model for a given number of clusters. The
            estimator must provide ``fit(X)`` (returning itself) and
            ``labels_``. Defaults to ``KMeans(n_clusters=size)``, so existing
            callers are unaffected; pass another factory to plug in a
            different algorithm.

    Returns:
        A ``defaultdict(list)`` mapping each cluster label to the list of
        document indices (positions in the vectorized extract list) in it.
    """
    if make_clusterer is None:
        def make_clusterer(size):
            return KMeans(n_clusters=size)

    # The vectorizer tokenizes with our process_text function.
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 max_df=0.5,
                                 min_df=0.005,
                                 ngram_range=(1, 4),
                                 lowercase=True)

    extracts = {}
    for page in plan.page_list:
        if len(page.text_extract) > 50:
            key = str(page.document_id) + '_' + str(page.page_number)
            extracts[key] = page.text_extract
    extract_lst = list(extracts.values())
    tfidf_model = vectorizer.fit_transform(extract_lst)
    n_docs = len(extract_lst)

    # Candidate cluster counts: 2 (in case the set is very small) plus about
    # 25 sizes at equal intervals. The interval is clamped to at least 1,
    # because with fewer than 25 documents it would be 0 and `i % 0` raised
    # ZeroDivisionError. silhouette_score needs 2 <= size <= n_docs - 1, so
    # sizes outside that range (and the duplicate 2) are left out.
    step = max(1, n_docs // 25)
    sizes_to_test = [2] if n_docs - 1 >= 2 else []
    sizes_to_test += [i for i in range(step, n_docs, step) if i > 2]

    # Track the best silhouette seen so far. iters_since_new_max lets us stop
    # once more than four sizes in a row fail to improve on it, so we don't
    # keep testing huge cluster counts.
    max_silhouette_coef = 0
    iters_since_new_max = 0
    good_size = min(2, n_docs)
    for size in sizes_to_test:
        labels = make_clusterer(size).fit(tfidf_model).labels_
        sil_coeff = silhouette_score(tfidf_model, labels, metric='euclidean')
        if sil_coeff > max_silhouette_coef:
            max_silhouette_coef = sil_coeff
            good_size = size
            iters_since_new_max = 0
        else:
            iters_since_new_max += 1
            if iters_since_new_max > 4:
                break

    # Finally, cluster with the size that scored best.
    final_model = make_clusterer(good_size)
    final_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(final_model.labels_):
        clustering[label].append(idx)
    return clustering
left as much comment as i can to help you all follow what i am going for can anyone help me improve this
You know KMeans is for numeric data only, right? I mean, don't expect it to work on labeled data. With KMeans, you calculate the distance to the nearest centroid (cluster center) and add the point to that cluster. What is the 'distance' between apple, banana, and watermelon? It doesn't make sense! So, just make sure you are running your KMeans over numeric data.
import numpy as np
import pandas as pd
from pylab import plot,show
from numpy import vstack,array
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns
df = pd.read_csv('foo.csv')
# work on a 10% sample (drawn with replacement) to keep the example fast
df = df.sample(frac=0.1, replace=True, random_state=1)
# get only numeric fields from your dataframe
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
for col in newdf.columns:
    print(col)
# your independent variables
X = newdf[['NumericField1','NumericField2','NumericField3','list_price']]
# your dependent variable
y = newdf['DependentVariable']
# take all numeric features from the corr exercise, and turn into an array
# so we can feed it into a clustering algorithm
# (note: X is re-assigned here, so the column selection above is not used)
data = np.asarray(newdf)
X = data
# computing K-Means with K = 100 (100 clusters)
centroids, _ = kmeans(data, 100)
# assign each sample to a cluster
idx, _ = vq(data, centroids)
# some plotting using numpy's logical indexing (first five clusters only)
plot(data[idx==0,0], data[idx==0,1], 'ob',
     data[idx==1,0], data[idx==1,1], 'oy',
     data[idx==2,0], data[idx==2,1], 'or',
     data[idx==3,0], data[idx==3,1], 'og',
     data[idx==4,0], data[idx==4,1], 'om')
plot(centroids[:,0], centroids[:,1], 'sg', markersize=8)
show()
# pair every sampled row's brand with the cluster it was assigned to
details = list(zip(df.brand, idx))
for detail in details:
    print(detail)
I've found Affinity Propagation to produce much tighter clusters than KMeans can achieve. Here is an example.
# Run Affinity Propagation Experiment
from sklearn.cluster import AffinityPropagation  # was used without an import

af = AffinityPropagation(preference=20).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# plt.scatter(X[:, 0], X[:, 1], s=50)
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    # the exemplar (centre) of cluster k is itself a row of X
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # draw a segment from the exemplar to every member of its cluster
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
Try these concepts and see how you get along.

KMeans not returning reproducible results in sklearn, even fixing random_state

The following code tests KMeans for several n_clusters and tries to find the "best" n_clusters by the inertia criterion. However, it is not reproducible: even fixing random_state, every time I call kmeans(df) on the same dataset, it generates different clustering - and even different n_clusters. Am I missing something here?
from sklearn.cluster import KMeans
from tqdm import tqdm_notebook
def kmeans(df, start=3, end=40):
    """Fit KMeans for each cluster count in ``range(start, end)`` and pick one.

    The chosen model is the one at the position where the second numerical
    derivative of the inertia curve is largest (an "elbow" heuristic).

    Args:
        df: DataFrame whose ``.values`` are the samples to cluster.
        start: smallest number of clusters to try (default 3, as before).
        end: exclusive upper bound on the number of clusters (default 40).
            At least two sizes must be tried, otherwise ``np.gradient``
            raises ``ValueError``.

    Returns:
        The fitted ``KMeans`` model for the selected number of clusters.
    """
    inertia = []
    models = {}
    for i in tqdm_notebook(range(start, end)):
        # `n_jobs` is no longer passed: it was removed from KMeans in
        # scikit-learn 1.0 and raises TypeError there.
        # NOTE(review): parallel runs may also have contributed to the
        # run-to-run differences described in the question - confirm.
        k = KMeans(n_clusters=i, init='k-means++', n_init=50,
                   random_state=10).fit(df.values)
        inertia.append(k.inertia_)
        models[i] = k
    # Index of the largest second derivative, shifted back to a cluster count.
    ep = np.argmax(np.gradient(np.gradient(np.array(inertia)))) + start
    return models[ep]
I am having this same issue. I think a closer solution is to freeze the model into a file, then import the model and use it to predict the cluster of a new phrase. If the vectorizer and the KMeans clustering are initialized every single time the program runs, the clusters seem to come out in a different order each time, so the hashmap does not work correctly and gives you a different number every time the function is called.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
# Sample array of string sentences
df = pd.read_csv('/workspaces/codespaces-flask//data/shuffled.csv')
# Seed the shuffle as well: an unseeded shuffle feeds the rows to KMeans in a
# different order on every run, so the clusters can differ even though
# KMeans itself uses a fixed random_state.
df = shuffle(df, random_state=42)
sentences = df['text'].values
# Convert the sentences into TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
# Perform K-Means clustering
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(X)
output = zip(sentences, clusters)
# Print the cluster assignments for each sentence
for sentence, cluster in zip(sentences, clusters):
    print("Sentence:", sentence, "Cluster:", cluster)
df = pd.DataFrame(output)
# Read journal texts from SQLite, take the last one, predict its cluster with
# the already-fitted vectorizer/kmeans, and map the cluster number to a name.
# NOTE(review): this snippet is incomplete as posted - the SQL string passed
# to cursor.execute is never closed, `names =` has no value, and sqlite3 /
# json are not imported in the visible code. It cannot run as shown.
db_file_name = '/workspaces/codespaces-flask/ThrAive/data/database1.db'
conn = sqlite3.connect(db_file_name)
cursor = conn.cursor()
cursor.execute("SELECT journal_text FROM Journal JOIN User ON Journal.id
= user.id
rows = cursor.fetchall()
conn.commit()
conn.close()
df1 = pd.DataFrame(rows)
df1 = df1.applymap(lambda x: " ".join(x.split()) if isinstance(x, str)
else x)
entry = df1
entry = entry
print(entry)
# Last row of the first column, lower-cased, wrapped in a one-element list.
entry = entry[0].iloc[-1].lower()
entry = [entry]
# Reuse the fitted vectorizer: transform only, no re-fit.
new_X = vectorizer.transform(entry)
# Predict the cluster assignments for the new sentences
new_clusters = kmeans.predict(new_X)
# NOTE(review): the loop variable re-binds `entry`, so after the loop it is
# the sentence string, not the list - check the zip() below does what is meant.
for entry, new_cluster in zip(entry, new_clusters):
print("Sentence:", entry, "Cluster:", new_cluster)
zipper = zip(entry, new_clusters)
df = pd.DataFrame(zipper)
df = df.applymap(lambda x: " ".join(x.split()) if isinstance(x, str)
else x)
df = df.to_string( header=False, index=False)
entry = df
output = entry
numbers = ['0', '1', '2', '3', '4','5','6','7','8']
# NOTE(review): the list of names is missing from the post.
names =
# Create a dictionary that maps numbers to names
number_to_name = {number: name for number, name in zip(numbers, names)}
# The last character of the rendered string is used as the cluster number.
print(output[-1])
output = number_to_name[output[-1]]
json_string = json.dumps(str(output))
I think that the solution is saving the model to disk
import pickle
# Train a scikit-learn model
model = ...  # placeholder: replace with your fitted estimator
# Save the model to disk
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)
and then load the pickle file and test it on the k-means without re-initializing the cluster.

python: increase performance of finding the best timeshift for a correlation between each X column and y

I have a dataframe X with several columns and a dataframe y with only one column (series). The rows in X represent timesteps and I want to find the interval I need to shift each column of X to obtain the highest correlation with y. I wrote a function that loops over all columns and then loops over all timesteps and correlates the X column with y. If the R² is better than before I store the timestep. However, with over 300 columns this routine is really taking some time and I need to increase the performance. Is there a nice way to simplify this code?
(In the example I used the iris data set which is of course not a timeseries...)
from sklearn import datasets
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from copy import deepcopy
def get_best_shift(dfX, dfy, ti=60, maxt=1440):
    """
    Determine, for every column of dfX, the row shift giving the strongest
    correlation (largest R**2) with the first column of dfy, and return a
    copy of dfX with each column shifted by its best shift.

    Shifts 0 .. int(maxt / ti) - 1 rows are tried, capped at the number of
    rows so that at least one sample always overlaps. Pairs whose product is
    NaN are ignored when correlating. In the result a shifted column is
    padded with zeros at the top and renamed to '<name>_t-<shift>'.

    dfX is not modified. dfy may be a one-column DataFrame or a Series and
    must have as many rows as dfX.
    """
    # Extract the target once as a flat float array, not once per shift.
    y_all = np.asarray(dfy, dtype=float)
    if y_all.ndim > 1:
        y_all = y_all[:, 0]

    n_rows = dfX.shape[0]
    n_shifts = min(int(maxt / ti), n_rows)

    df_out = dfX.copy()
    new_names = {}
    for xcol in dfX:
        x_all = np.asarray(dfX[xcol], dtype=float)
        bestshift = 0
        Rmax = 0
        for ishift in range(n_shifts):
            xvals = x_all[:n_rows - ishift]
            yvals = y_all[ishift:]
            # Vectorized NaN mask replaces the per-element str(val) != "nan".
            valid = ~np.isnan(xvals * yvals)
            R = np.corrcoef(xvals[valid], yvals[valid])[0][1]
            # A NaN correlation (e.g. constant data) never compares greater.
            if R ** 2 > Rmax:
                Rmax = R ** 2
                bestshift = ishift
        df_out[xcol] = list(np.zeros(bestshift)) + list(dfX[xcol].iloc[0:n_rows - bestshift].values)
        new_names[xcol] = ''.join([str(xcol), '_t-', str(bestshift)])
    # Rename once at the end instead of copying the frame for every column.
    return df_out.rename(columns=new_names)
# Example run on the iris data (not a time series; it only exercises the function).
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
# The target is wrapped in a one-column DataFrame before being passed in.
y = pd.DataFrame(iris.target)
df = get_best_shift(X,y)

Categories

Resources