I'm having trouble rounding decimals while encoding in Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
df = pd.read_csv("mushrooms.csv",index_col=False,header=None)
def n(target):
    if target == 'p':
        return 1
    elif target == 'e':
        return 0

df[0] = df[0].apply(n)
# manually encoding the targets
targets = df[0]
inputs = df[df.columns[1:]]

def test_train_split(mydf, inputs, tratio, target):
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=tratio, random_state=42)
    train_index, test_index = next(splitter.split(inputs, target))
    strat_train = mydf.iloc[train_index]
    strat_test = mydf.iloc[test_index]
    return strat_train, strat_test

def print_test_train_dfs(train_df, test_df, target_column='None'):
    print("\nTraining data:")
    train_df.info()
    if target_column != 'None':
        print(train_df[target_column].value_counts())
    print('\nTest data:')
    test_df.info()
    if target_column != 'None':
        print(test_df[target_column].value_counts())

traindf, testdf = test_train_split(df, inputs, 0.2, targets)
enc = OrdinalEncoder()
enc.fit(traindf)
df = enc.transform(testdf)
for i in range(len(df)):
    for j in range(len(df[1])):
        df[i][j].round(0)
df = pd.DataFrame.from_records(df)
print(df)
df always ends up with decimals like 1.0 instead of just 1, which is what I want.
The dataset I'm using is here
https://www.kaggle.com/uciml/mushroom-classification
I'll also add that after .transform, df is more of an array than a DataFrame.

df.astype(int) should cast the values to integers.
Refer to this question for more information:
Change data type of columns in Pandas
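For example, applied to the variable names in the question (a sketch; it assumes the encoder output contains only whole numbers, which OrdinalEncoder guarantees, so the cast is lossless):
# enc.transform returns a NumPy array of floats such as 1.0, 2.0, ...
encoded = enc.transform(testdf)
# rebuild the DataFrame and cast every column to int in one step
df = pd.DataFrame.from_records(encoded).astype(int)
print(df.dtypes)  # all columns are now int64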

Related

using tf.keras.layers.Embedding for categorical variables in regression problem

Using the iris dataset as a hypothetical hello world example:
import pandas as pd
from sklearn import datasets
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0,1,2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "", regex=True)
print(df.head())
Let us say I want to use tf.keras.layers.Embedding instead of one-hot/dummy encoding as part of an ANN for regression, e.g.:
iris_class_name + sepalwidthcm + petallengthcm -> sepallengthcm
where sepallengthcm is the dependent variable. I came across this:
city_lookup = tf.keras.layers.StringLookup(vocabulary = city_vocabulary, mask_token = None)
city_embedding = tf.keras.Sequential([
    city_lookup,
    tf.keras.layers.Embedding(len(city_vocabulary) + 1, embedding_dimension)
], "city_embedding")
city = features["city"]
city_embedding_output = city_embedding(city)
but am not sure how to exactly use it in my use case. Any pointers very much welcome. Thanks!
You can map iris_class_name to n-dimensional vector representations and then concatenate with the other continuous features:
import pandas as pd
from sklearn import datasets
import numpy as np
import tensorflow as tf

iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0, 1, 2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "", regex=True)

vocab = df['iris_class_name'].unique()
embedding_dimension = 10
lookup = tf.keras.layers.StringLookup(vocabulary = vocab, mask_token = None)
embedding = tf.keras.Sequential([
    lookup,
    tf.keras.layers.Embedding(len(vocab) + 1, embedding_dimension)
])
names = df['iris_class_name'].to_numpy()
embedding_output = embedding(names)
features = np.concatenate((embedding_output, df[['sepalwidthcm', 'petallengthcm']].to_numpy()), axis=-1)
print(features.shape)
(150, 12)
Since you have 3 unique iris class names, you could also simply create an integer-to-vector dictionary manually, but it is up to you.
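If the goal is to wire this into a regression network end to end, a minimal Functional-API sketch could look like the following (the layer sizes, optimizer, loss, and training arguments are assumptions for illustration, not from the original answer; df, vocab and embedding_dimension come from the snippet above):
# Regression: iris_class_name + sepalwidthcm + petallengthcm -> sepallengthcm
name_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='iris_class_name')
numeric_input = tf.keras.Input(shape=(2,), name='numeric_features')

x = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)(name_input)
x = tf.keras.layers.Embedding(len(vocab) + 1, embedding_dimension)(x)
x = tf.keras.layers.Flatten()(x)                       # (batch, embedding_dimension)
x = tf.keras.layers.Concatenate()([x, numeric_input])  # embeddings + 2 numeric columns
x = tf.keras.layers.Dense(32, activation='relu')(x)
output = tf.keras.layers.Dense(1)(x)                   # predicted sepallengthcm

model = tf.keras.Model([name_input, numeric_input], output)
model.compile(optimizer='adam', loss='mse')
model.fit(
    [df['iris_class_name'].to_numpy().reshape(-1, 1),
     df[['sepalwidthcm', 'petallengthcm']].to_numpy()],
    df['sepallengthcm'].to_numpy(),
    epochs=10, verbose=0)
Because the StringLookup and Embedding layers sit inside the model, raw class-name strings can be passed directly at inference time.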

Leave-one-out cross validation with a support vector machine

We were given some code for a support vector machine where we are supposed to implement leave-one-out cross validation. If I understand it correctly, leave-one-out will create as many test sets as there are samples, which means that for a big data set the process will be costly and will most likely take quite a long time to generate results.
I have tried to add leave-one-out to the given SVM code with only one iteration and 773 data points in total. I expected it to take some time, but two hours later the code is still running without any result, which makes me believe that it might be stuck in a loop somewhere.
Does anyone have a suggestion as to what might be wrong? I'm not getting any error message either.
The entire code is as follows, with the leave-one-out part in the last function at the bottom (executed in a Jupyter notebook on an online Binder):
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy as gp
from gseapy.plot import gseaplot
import qvalue
from ipywidgets import interact, interact_manual
from ipywidgets import IntSlider, FloatSlider, Dropdown, Text
import sklearn as skl
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut
from sklearn import svm

interact_enrich = interact_manual.options(manual_name="Enrichment analysis")
interact_plot = interact_manual.options(manual_name="Plot")
interact_calc = interact_manual.options(manual_name="Calculate tests")
interact_gen = interact_manual.options(manual_name="Initialize data")
interact_SVM = interact_manual.options(manual_name="Train SVM")

clinical_data = pd.read_csv('../data/brca_clin.tsv.gz', sep='\t', index_col=2)
clinical_data = clinical_data.iloc[4:, 1:]
expression_data = pd.read_csv('../data/brca.tsv.gz', sep='\t', index_col=1)
expression_data = expression_data.iloc[:, 2:].T

def split_data(clinical_df, expression_df, separator, cond1, cond2):
    try:
        group1 = clinical_df[separator] == cond1
        index1 = clinical_df[group1].index
        group2 = clinical_df[separator] == cond2
        index2 = clinical_df[group2].index
    except:
        print('Clinical condition wrong')
    expression1 = expression_df.loc[index1].dropna()
    expression2 = expression_df.loc[index2].dropna()
    expression = pd.concat([expression1, expression2])
    X = expression.values
    y = np.append(np.repeat(0, len(expression1)), np.repeat(1, len(expression2)))
    display(pd.DataFrame([len(index1), len(index2)], columns=['Number of points'], index=['Group 1', 'Group 2']))
    return X, y

def plot_pca_variance(X, scale=False, ncomp=1):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA()
    pca.fit(X)
    plt.rcParams["figure.figsize"] = (20, 10)
    sns.set(style='darkgrid', context='talk')
    plt.plot(np.arange(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.vlines(ncomp, 0, plt.gca().get_ylim()[1], color='r', linestyles='dashed')
    h = np.cumsum(pca.explained_variance_ratio_)[ncomp - 1]
    plt.hlines(h, 0, plt.gca().get_xlim()[1], color='r', linestyles='dashed')
    plt.title(str(ncomp) + ' components, ' + str(round(h, 3)) + ' variance explained')
    plt.show()

def reduce_data(X, n, scale=True):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA(n_components=n)
    Xr = pca.fit_transform(X)
    return Xr

def interact_split_data(Criteria, Group_1, Group_2):
    global BRCA_X, BRCA_y
    BRCA_X, BRCA_y = split_data(clinical_data, expression_data, Criteria, Group_1, Group_2)

def interact_SVM_1(Rescale, Max_iterations):
    max_iter = int(Max_iterations)
    loo = LeaveOneOut()
    ac_matrix_train, ac_matrix_test = np.array([]), np.array([])
    for train_id, test_id in loo.split(BRCA_X, BRCA_y):
        X_train, X_test, y_train, y_test = BRCA_X[train_id, :], BRCA_X[test_id, :], BRCA_y[train_id], BRCA_y[test_id]
        clf = svm.LinearSVC(C=0.1, max_iter=100000).fit(X_train, y_train)  # Train an SVM
        y_train_pred = clf.predict(X_train)
        ac_matrix_train = confusion_matrix(y_train, y_train_pred)
        y_test_pred = clf.predict(X_test)
        ac_matrix_test = confusion_matrix(y_test, y_test_pred)
    display(pd.DataFrame(np.concatenate((ac_matrix_train, ac_matrix_test), axis=1), columns=["predicted G1 (training)", "predicted G2 (training)", "predicted G1 (test)", "predicted G2 (test)"], index=["actual G1", "actual G2"]))

interact_gen(interact_split_data, Criteria=Text('PR status by ihc'), Group_1=Text('Positive'), Group_2=Text('Negative'))
interact_SVM(interact_SVM_1, Rescale=False, Max_iterations=Text('1'))
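One way to tell whether the loop is stuck or just slow is to time a single fold and extrapolate (a rough sketch, not part of the original notebook; it assumes BRCA_X and BRCA_y have already been created by interact_split_data):
import time
from sklearn import svm

# Time one leave-one-out fold; total LOOCV cost is roughly this times len(BRCA_y).
start = time.time()
svm.LinearSVC(C=0.1, max_iter=100000).fit(BRCA_X[1:], BRCA_y[1:])  # leave sample 0 out
one_fit = time.time() - start
print("one fit: %.1f s, estimated LOOCV total: %.1f h" % (one_fit, one_fit * len(BRCA_y) / 3600))
With high-dimensional expression data and max_iter=100000, a single fit can already take a long time, so 773 of them may simply not have finished yet.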

Python SKlearn fit method not working

I'm working on a project using Python (3.6) and scikit-learn. I have done classifications before, but when I try to reshape the data in order to use it with sklearn's fit method, it returns an error.
Here's what I have tried:
# Get all the columns from the dataframe
columns = data.columns.tolist()
# Filter the columns to remove data we don't want
columns = [c for c in columns if c not in ["Class"]]
# Store the variable we want to predict on
target = "Class"
X = data.drop(target, 1)
Y = data[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)
# Define a random state
state = 1
# Define the outlier detection methods
classifiers = {
    "Isolation Forest": IsolationForest(max_samples=len(X),
                                        contamination=outlier_fraction,
                                        random_state=state),
    "Local Outlier Factor": LocalOutlierFactor(
        n_neighbors=20,
        contamination=outlier_fraction)
}
# Fit the model
n_outliers = len(Fraud)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # Fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)
    # Reshape the prediction values to 0 for valid and 1 for fraudulent
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # Run classification metrics
    print('{}:{}'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))
Then it returns the following error:
ValueError: could not convert string to float: '301.48 Change: $0.00'
and it points to the `clf.fit(X)` line.
What have I configured wrong?
We can convert our dataset to numeric values based on the uniqueness of each entry, and you can also drop unnecessary columns from the dataset.
Here's how you can try that:
import numpy as np
import pandas as pd

df_full = pd.read_excel('input/samp.xlsx', sheet_name=0)
df_full = df_full[df_full.filter(regex='^(?!Unnamed)').columns]
df_full.drop(['paymentdetails'], 1, inplace=True)
df_full.drop(['timestamp'], 1, inplace=True)

# Handle non-numeric data
def handle_non_numaric_data(df_full):
    columns = df_full.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        if df_full[column].dtype != np.int64 and df_full[column].dtype != np.float64:
            column_contents = df_full[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df_full[column] = list(map(convert_to_int, df_full[column]))
    return df_full
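A shorter route to the same kind of integer encoding, if you would rather not hand-roll the mapping (a sketch with a made-up two-column frame; the column names and values are placeholders, not from the question):
import pandas as pd

df = pd.DataFrame({"category": ["cash", "card", "cash"], "amount": [10.0, 12.5, 9.0]})
# pd.factorize assigns a distinct integer to every unique value of a column.
for col in df.select_dtypes(include="object").columns:
    df[col], _ = pd.factorize(df[col])
print(df.dtypes)  # every column is now numeric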

Sorting csv data by headers but getting IndexError

My code seems to fail when I try to set which headers/columns of data I want to use, giving me an IndexError when trying to parse the headers.
import pandas as pd
import quandl
import math, datetime
import numpy as np
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
import scipy
import matplotlib.pyplot as plt
from matplotlib import style
import pickle

style.use('ggplot')

df = pd.read_csv('convertcsv.csv', sep='\t')
df = np.array(df)
print(df)
df = df[['Open', 'High', 'Low', 'Close', 'Volume (BTC)']]
print("ok")
df['HL_PCT'] = (df['High'] - df['Close']) / df['Close'] * 100.0
df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume (BTC)']]
forecast_col = 'Close'
df.fillna(-999999, inplace=True)
forecast_out = int(math.ceil(0.01 * len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)

X = np.array(df.drop(['label'], 1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out:]
df.dropna(inplace=True)
y = np.array(df['label'])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
with open('linearregression.pickle', 'wb') as f:
    pickle.dump(clf, f)
pickle_in = open('linearregression.pickle', 'rb')
clf = pickle.load(pickle_in)
accuracy = clf.score(X_test, y_test)
print(accuracy)

forecast_set = clf.predict(X_lately)
df['Forecast'] = np.nan
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]

df['Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.pause(1)
plt.show()
print("we done?")
...
I can't seem to figure out what I am doing wrong; it worked with the previous data set I was using. If it helps, here is the format of the CSV file that I was pulling from:
Timestamp,Open,High,Low,Close,Volume (BTC),Volume (Currency),Weighted Price
2017-09-30 00:00:00,4162.04,4177.63,4154.28,4176.08,114.81,478389.12,4166.96
2017-09-30 01:00:00,4170.84,4224.6,4170.84,4208.14,348.45,1463989.18,4201.4
I am not too experienced with this sort of thing, and I tried to find other people with the same error, but everyone was having a different sort of problem. I can include more data if needed.
You're converting your dataframe to a numpy array with df = np.array(df).
Don't expect a numpy array to function as a pandas dataframe.
Remove
df = np.array(df)
and you should be able to slice your matrix by column name with
df = df[['Open','High','Low','Close','Volume (BTC)']]
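For completeness, a sketch of the corrected start of the script (note that the sample rows above are comma-separated, so dropping sep='\t' is an assumption based on that sample, not something the answer addressed):
import pandas as pd

# Keep df as a pandas DataFrame so selecting columns by name works.
df = pd.read_csv('convertcsv.csv')  # default sep=',' matches the sample rows
df = df[['Open', 'High', 'Low', 'Close', 'Volume (BTC)']]
print(df.head())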

Stumped by this error: TypeError: 'PCA' object is not callable

This error eludes me, because after running type(PCAdata), it returns <class 'numpy.ndarray'>. After reading about similar "'Module' object is not callable" errors, they usually seem to concern not importing the object itself from within its module, such as "from PCA import PCA". However, I'm already importing PCA from sklearn.decomposition.
Here is my data: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
import pandas as pd
import numpy as np

# Load CSV
filename = 'data.csv'
data = pd.read_csv(filename)
df = pd.DataFrame(data)
df = df.dropna(axis=1, how='all')
array = df.values
X = array[:, 2:32]
Y = array[:, 1]

# Normalize data
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

df_normalized = normalize(df[df.columns[2:32]])
pca = PCA(n_components=16)
pca.fit_transform(df_normalized)
PCAdf = pd.DataFrame(pca.components_, columns=df_normalized.columns, index=['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5', 'PC-6', 'PC-7', 'PC-8', 'PC-9', 'PC-10', 'PC-11', 'PC-12', 'PC-13', 'PC-14', 'PC-15', 'PC-16'])
PCAarray = PCAdf.values

# Convert all of the "M" class labels to 1, and "B" labels to 0
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
df_v_y_encoded = encoder.transform(df_v_y)

# Train again, this time using features from principal component analysis.
classifierPCAfeatures = svm.SVC(gamma="auto", C=1, kernel="rbf", decision_function_shape='ovo')
classifierPCAfeatures = fit(PCAdf, encoded_Y)
print(classifierPCAfeatures.score(df_v_x, df_v_y_encoded))
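For reference, the "'PCA' object is not callable" TypeError is what you get when a PCA instance is used like a function; transformation goes through its methods instead. A small sketch with toy data (whether such a call is what actually triggered the error is an assumption, since it does not appear in the posted snippet):
from sklearn.decomposition import PCA
import numpy as np

X = np.random.rand(20, 5)           # stand-in for the normalized features
pca = PCA(n_components=3)
# pca(X)                            # TypeError: 'PCA' object is not callable
X_reduced = pca.fit_transform(X)    # call the method, not the object
print(X_reduced.shape)              # (20, 3)
Similarly, the classifier in the last lines would be trained with its own method, classifierPCAfeatures.fit(PCAdf, encoded_Y), rather than a bare fit(PCAdf, encoded_Y).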
