I just recently started learning data science. This is what I wrote:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score
import numpy as np
#reading data
df = pd.read_csv('titanic.csv')
df['male'] = df['Sex'] == 'male'
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values
# splitting data into train/test folds
kf = KFold(n_splits=5, shuffle=True, random_state=10)
tree_scores = {'accuracy_scores':[],'precision_scores':[],'recall_scores':[]}
logistic_scores = {'accuracy_scores':[],'precision_scores':[],'recall_scores':[]}
#making the models
for train_indexes, test_indexes in kf.split(X):
    X_train, X_test = X[train_indexes], X[test_indexes]
    y_train, y_test = y[train_indexes], y[test_indexes]
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    tree_scores['accuracy_scores'].append(tree.score(X_test, y_test))
    tree_prediction = tree.predict(X_test)
    #tree_scores['precision_scores'].append(tree.precision_score(y_test,tree_prediction))
    #tree_scores['recall_scores'].append(tree.recall_score(y_test,tree_prediction))
    logistic = LogisticRegression()
    logistic.fit(X_train, y_train)
    logistic_scores['accuracy_scores'].append(logistic.score(X_test, y_test))
    logistic_prediction = logistic.predict(X_test)
    logistic_scores['precision_scores'].append(precision_score(y_test, logistic_prediction))
    logistic_scores['recall_scores'].append(recall_score(y_test, logistic_prediction))
print("Decision Tree")
print(" accuracy:", np.mean(tree_scores['accuracy_scores']))
print(" precision:", np.mean(tree_scores['precision_scores']))
print(" recall:", np.mean(tree_scores['recall_scores']))
print("Logistic Regression")
print(" accuracy:", np.mean(logistic_scores['accuracy_scores']))
print(" precision:", np.mean(logistic_scores['precision_scores']))
print(" recall:", np.mean(logistic_scores['recall_scores']))
The two lines commented out in the for loop give an error for both precision and recall, and I don't know why. Before, when I was running both precision and recall, they worked, and I can't seem to find any spelling mistake either.
I wonder if the different Python syntaxes are interfering with sklearn, because once I tried a combination like this:
X = df.loc['Plass':'Fare'].values
y = df.Survived.values
and it gave errors, but when I used the normal, expected way it worked fine.
(Note: the code may be wrongly indented; this is my first time using Stack Exchange.)
Indeed, DecisionTreeClassifier doesn't have such a method. precision_score and recall_score are standalone functions in sklearn.metrics, not methods of the classifier.
You need to change:
tree_scores['precision_scores'].append(tree.precision_score(y_test,tree_prediction))
tree_scores['recall_scores'].append(tree.recall_score(y_test,tree_prediction))
to:
tree_scores['precision_scores'].append(precision_score(y_test,tree_prediction))
tree_scores['recall_scores'].append(recall_score(y_test,tree_prediction))
and you're good to go.
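For reference, a minimal sketch of the two functions called on their own, with made-up labels (hypothetical values, just for illustration; your real code passes y_test and tree_prediction as in the fix above):
from sklearn.metrics import precision_score, recall_score
y_true = [0, 1, 1, 0, 1]  # hypothetical ground truth
y_pred = [0, 1, 0, 0, 1]  # hypothetical predictions
print(precision_score(y_true, y_pred))  # 1.0 -> no false positives
print(recall_score(y_true, y_pred))     # ~0.67 -> one positive missed
As for the df.loc attempt in the question: with a single argument, .loc slices rows by label, so a column slice needs both axes, e.g. df.loc[:, 'Pclass':'Fare'].values ('Plass' is also missing its 'c'). Note that a label slice grabs every column between the two endpoints, so it only matches your list if those columns are contiguous in the frame.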
Related
I'm testing machine learning methods on a CSV file with Kickstarter project data. Even though I can get the accuracy score, I get the following error when I try to get the R2 score. What would be the reason?
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split  # needed for the split below
veri = pd.read_csv("kick_rev.csv")
veri = veri.drop(['id'], axis=1)
veri = veri.drop(['i'], axis=1)
y = np.array(veri['state_num'])
x = np.array(veri.drop(['state_num','usd_goal_real','deadline','launched','country'], axis=1))
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
DTR = DecisionTreeRegressor()
DTR.fit(X_train,y_train)
ytahmin = DTR.predict(x)
DTR.fit(veri[['goal','pledged','backers','usd_pledged','usd_pledged_real','category_num','category_main_num','currency_num','country_num']],veri.state_num)
accuracy_score = DTR.score(X_test,y_test)
a = np.array([5000,94175.0,1,57763.8,6469.73,13,6,0,0]).reshape(1, -1)
predict_DTR = DTR.predict(a)
r2 = DTR.r2_score(X_test, y_test)
print(accuracy_score)
print(r2)
Error:
AttributeError: 'DecisionTreeRegressor' object has no attribute 'r2_score'
The R2 score compares predicted and actual values, so you can't use the training features and a prediction for the comparison. Note the argument order as well:
r2_score(y_true, y_pred)
You can use this link for more clarification
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html
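A minimal sketch of the fix, reusing the names from the question (DTR, X_test, y_test):
from sklearn.metrics import r2_score
y_pred = DTR.predict(X_test)   # predict on the held-out features
r2 = r2_score(y_test, y_pred)  # compare actual vs. predicted values
print(r2)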
Here is the error message I get when the Python code below is run. This code came from a machine learning ebook, and I simply altered it to match the data set:
ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
# Cross Validation Classification ROC AUC
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
df = read_csv('Diabetes_Classification.csv')
array = df.values
X = array[:,0:14]
Y = array[:,14]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression(solver='liblinear')
scoring = 'roc_auc'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))
I have tried to run an SVM program, and I got the above error. The code is below. Please point out the error in it.
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # used for the final score below
data = pd.read_csv('risk_factors_cervical_cancer.csv')
X = np.array(data[[#some data elements]])
y = np.array(data[#some data elements])
print(X)
print(y)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=30)
classifier = svm.SVC()
classifier.fit(X_train, y_train) #the error occurs here
y_pred = classifier.predict(X_test)  # predict with the fitted classifier, not the svm module
acc = accuracy_score(y_test, y_pred)
As @Guimoute wrote, preprocessing your data is always necessary before training any machine learning algorithm on it. Try data.head(10) to get a first look at the data you are using. Your error occurs because there is a value "?" in your X columns. Replace it with some reasonable number, for example the mean of the column, to get better results.
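A minimal sketch of that cleaning step, assuming the data DataFrame from the question (its column names are elided there, so none are referenced here):
import numpy as np
import pandas as pd
data = data.replace('?', np.nan)                   # treat "?" as missing
data = data.apply(pd.to_numeric, errors='coerce')  # coerce every column to numeric
data = data.fillna(data.mean())                    # fill gaps with each column's mean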
I'm working on the Titanic competition in the Spyder IDE. The code is barely complete, but I'm doing it one step at a time (and this is the first time I've ever built a learning model). Now I'm getting a Found input variables with inconsistent numbers of samples: [891, 183] error in the log when I try to run my code. This is what I have so far:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
train_path = "C:\\Users\\Omar\\Downloads\\Titanic Data\\train.csv"
train_data = pd.read_csv(train_path)
columns_of_interest = ['Survived','Pclass', 'Sex', 'Age']
filtered_titanic_data = train_data.dropna(axis=0)
x = train_data[columns_of_interest]
y = filtered_titanic_data.Survived
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=0)
titanic_model = DecisionTreeRegressor()
titanic_model.fit(train_x, train_y)
val_predictions = titanic_model.predict(val_x)
print(filtered_titanic_data)
I don't know whether it's coming from the data file or the parameters. I'm sorry if this is a simple question; I couldn't implement other people's solutions.
The error is because you are taking the labels (y) from the filtered data but taking x from the unfiltered data.
Change the following line
x = train_data[columns_of_interest]
to
x = filtered_titanic_data[columns_of_interest]
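As a hedged aside, you can also restrict the row-dropping to the columns you actually use, which keeps more training rows:
filtered_titanic_data = train_data.dropna(subset=columns_of_interest)  # drop a row only if a used column is missing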
I am trying to run a kNN (k-nearest neighbour) algorithm in Python.
The dataset I am using to try and do this is available at the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/wine
Here is the code I am using:
#1. LIBRARIES
import os
import pandas as pd
import numpy as np
print(os.getcwd())  # prints the working directory
os.chdir('C:\\file_path') # Provide the path here
#2. VARIABLES
variables = pd.read_csv('wines.csv')
winery = variables['winery']
alcohol = variables['alcohol']
malic = variables['malic']
ash = variables['ash']
ash_alcalinity = variables['ash_alcalinity']
magnesium = variables['magnesium']
phenols = variables['phenols']
flavanoids = variables['flavanoids']
nonflavanoids = variables['nonflavanoids']
proanthocyanins = variables['proanthocyanins']
color_intensity = variables['color_intensity']
hue = variables['hue']
od280 = variables['od280']
proline = variables['proline']
#3. MAX-MIN NORMALIZATION
alcoholscaled=(alcohol-min(alcohol))/(max(alcohol)-min(alcohol))
malicscaled=(malic-min(malic))/(max(malic)-min(malic))
ashscaled=(ash-min(ash))/(max(ash)-min(ash))
ash_alcalinity_scaled=(ash_alcalinity-min(ash_alcalinity))/(max(ash_alcalinity)-min(ash_alcalinity))
magnesiumscaled=(magnesium-min(magnesium))/(max(magnesium)-min(magnesium))
phenolsscaled=(phenols-min(phenols))/(max(phenols)-min(phenols))
flavanoidsscaled=(flavanoids-min(flavanoids))/(max(flavanoids)-min(flavanoids))
nonflavanoidsscaled=(nonflavanoids-min(nonflavanoids))/(max(nonflavanoids)-min(nonflavanoids))
proanthocyaninsscaled=(proanthocyanins-min(proanthocyanins))/(max(proanthocyanins)-min(proanthocyanins))
color_intensity_scaled=(color_intensity-min(color_intensity))/(max(color_intensity)-min(color_intensity))
huescaled=(hue-min(hue))/(max(hue)-min(hue))
od280scaled=(od280-min(od280))/(max(od280)-min(od280))
prolinescaled=(proline-min(proline))/(max(proline)-min(proline))
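# quick sanity checks on one scaled column (computed but not printed)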
alcoholscaled.mean()
alcoholscaled.median()
alcoholscaled.min()
alcoholscaled.max()
#4. DATA FRAME
d = {'alcoholscaled' : pd.Series([alcoholscaled]),
'malicscaled' : pd.Series([malicscaled]),
'ashscaled' : pd.Series([ashscaled]),
'ash_alcalinity_scaled' : pd.Series([ash_alcalinity_scaled]),
'magnesiumscaled' : pd.Series([magnesiumscaled]),
'phenolsscaled' : pd.Series([phenolsscaled]),
'flavanoidsscaled' : pd.Series([flavanoidsscaled]),
'nonflavanoidsscaled' : pd.Series([nonflavanoidsscaled]),
'proanthocyaninsscaled' : pd.Series([proanthocyaninsscaled]),
'color_intensity_scaled' : pd.Series([color_intensity_scaled]),
'hue_scaled' : pd.Series([huescaled]),
'od280scaled' : pd.Series([od280scaled]),
'prolinescaled' : pd.Series([prolinescaled])}
df = pd.DataFrame(d)
#5. TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(np.matrix(df),np.matrix(winery),test_size=0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
#6. K-NEAREST NEIGHBOUR ALGORITHM
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))
In section 5, when I run the train-test split imported from sklearn.model_selection, it does not appear to run correctly: it produces the shapes (0,13), (0,178), (1,13), (1,178).
Then, upon trying to run the kNN, I get the error message: Found array with 0 sample(s) (shape=(0,13)) while a minimum of 1 is required. This is not due to the max-min normalisation, as I still get this error message even when the variables are not scaled.
I'm not exactly sure where your code is going wrong; it's a slightly different way of going about it compared to the sklearn docs. (One suspicion: each pd.Series([column]) wraps an entire column into a single element, so df ends up with just one row, which would explain the (1,13) shape.) However, I can show you a different way of getting the train-test split to work on the wine dataset.
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
X, y = load_wine(return_X_y=True)
X_scaled = MinMaxScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
test_size=0.3)
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
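As a quick check, the same score print from the question should now work (mirroring its format):
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))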