I'm practicing linear regression by analyzing the Google Play Store dataset to predict Rating; the CSV file is on Kaggle.
After cleaning the data and applying KNeighborsRegressor, the resulting accuracy and R-squared are too low and I don't know why.
However, the difference between the predictions and y_test is small and the MSE is quite low.
I think there are some mistakes here, and I hope you can help me fix them. I would like to reach an accuracy of about 90%.
import re
import sys
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
df = pd.read_csv('googleplaystore.csv')
df['Rating'] = df['Rating'].fillna(df['Rating'].median())
replaces = [u'\u00AE', u'\u2013', u'\u00C3', u'\u00E3', u'\u00B3', '[', ']', "'"]
for i in replaces:
    df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x: x.replace(i, ''))
regex = [r'[-+|/:/;(_)#]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x: re.sub(j, '0', x))
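# Keep only the first dot as the decimal separator, e.g. '1.2.3' -> 1.23, so the version string parses as a float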
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)).astype(float)
df['Current Ver'] = df['Current Ver'].fillna(df['Current Ver'].median())
df.drop([10472], axis = 0, inplace = True)
le = preprocessing.LabelEncoder()
df['App'] = le.fit_transform(df['App'])
category_list = df['Category'].unique().tolist()
category_list = ['cat_' + word for word in category_list]
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1)
df['Genres'] = df['Genres'].str.split(';').str[0]
df['Genres'].replace('Music & Audio', 'Music', inplace =True)
le = preprocessing.LabelEncoder()
df['Genres'] = le.fit_transform(df['Genres'])
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])
df['Price'] = df['Price'].apply(lambda x : x.strip('$'))
df['Installs'] = df['Installs'].apply(lambda x : x.strip('+').replace(',', ''))
df['Type'] = pd.get_dummies(df['Type'])
def change_size(size):
    if 'M' in size:
        x = size[:-1]
        return float(x) * 1000000
    elif 'k' == size[-1:]:
        x = size[:-1]
        return float(x) * 1000
    else:
        return None
df['Size'] = df['Size'].apply(change_size)
df['Size'] = df['Size'].fillna(value=df['Size'].median(), axis = 0)
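# Turn 'Last Updated' into days relative to the most recent update in the dataset (values are <= 0)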
df['new'] = pd.to_datetime(df['Last Updated'])
df['lastupdate'] = (df['new'] - df['new'].max()).dt.days
features = ['App', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'lastupdate','Content Rating', 'Genres', 'Current Ver']
features.extend(category_list)
X = df[features]
y = df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
model = KNeighborsRegressor(n_neighbors=28)
model.fit(X_train, y_train)  # fit before predicting
predictions = model.predict(X_test)
accuracy = model.score(X_test,y_test)
print('Accuracy: ' + str(np.round(accuracy*100, 2)) + '%')
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
result = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
result
Accuracy in the exact-match sense (as in a classifier) is not meaningful here: if the actual rating is 4.3 and the prediction is 4.3001, an exact-match check would count it as an error. But what is being done here is regression, and in that case MSE and MAE are better metrics.
You could try bucketing the y_test and the predictions to get an idea of the "accuracy" like so:
>>> metrics.accuracy_score(np.around(y_test), np.around(predictions))
0.7666051660516605
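Another rough option, sketched under the arbitrary assumption that being within half a star counts as correct, is the fraction of predictions within a tolerance of the true rating:
>>> tolerance = 0.5
>>> np.mean(np.abs(y_test - predictions) <= tolerance)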
My code:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
df = pd.read_csv('orderlist.csv', skiprows=1, delimiter=';', encoding="utf8")
df.columns = ["date", "customer_number", "item_code", "quantity"]
df['customer_item'] = df.customer_number + ', ' + df.item_code
df['date'] = pd.to_datetime(df['date'])
df["quantity"] = df["quantity"].astype(int, errors='ignore')
df["week"]=df.date.dt.week
df_grup = df.groupby(by=['week',"customer_item"]).quantity.sum().reset_index()
df_dum = pd.get_dummies(df_grup)
X, y = df_dum, df_dum["quantity"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
dtree = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dtree.predict(X_test)
pred_quantity = dtree.predict(df_dum)
print("predict quantity:")
print(pred_quantity)
result:
predict quantity:
[100 5 450 ... 295 22 639]
I need to print the customer number next to each predicted quantity.
The nth item of pred_quantity corresponds to the nth row of df_grup, the grouped frame that df_dum (which the predictions were made on) was built from, so you can either add pred_quantity as a column to df_grup
df_grup['pred_quantity'] = pred_quantity
df_grup['customer_number'] = df_grup['customer_item'].str.split(', ').str[0]
print(df_grup[['customer_number', 'pred_quantity']])
or use zip (docs) to print them side by side
for item, quantity in zip(df_grup['customer_item'], pred_quantity):
    print(item.split(', ')[0], quantity)
Here is my code.
import pandas as pd
import numpy as np
import json
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
training_data = pd.read_csv('/Users/aus10/Desktop/MLB_Data/Test_Training_Data/MLB_Training_Data.csv')
df_model = training_data.copy()
scaler = StandardScaler()
features = [['OBS', 'Runs']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])
test_data = pd.read_csv('/Users/aus10/Desktop/MLB_Data/Test_Training_Data/Test_Data.csv')
X = training_data.iloc[:,1] #independent columns
y = training_data.iloc[:,-1] #target column
X = X.values.reshape(-1,1)
results = []
# split the data, then fit the final model on the training portion only
model = XGBRegressor(objective="reg:squarederror", random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (
round(mean_squared_error(y_train, y_train_pred),2),
round(mean_squared_error(y_test, y_test_pred),2)
))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
# define one new data instance
index = 0
count = 0
while count < len(test_data):
    team = test_data.loc[index].at['Team']
    OBS = test_data.loc[index].at['OBS']
    Xnew = [[ OBS ]]
    # make a prediction
    ynew = model.predict(Xnew)
    # show the inputs and predicted outputs
    results.append(
        {
            'Team': team,
            'Runs': round(ynew[0], 2)
        })
    index += 1
    count += 1
sorted_results = sorted(results, key=lambda k: k['Runs'], reverse=True)
df = pd.DataFrame(sorted_results, columns=[
'Team', 'Runs'])
writer = pd.ExcelWriter('/Users/aus10/Desktop/MLB_Data/ML/Results/Projected_Runs_XGBoost.xlsx', engine='xlsxwriter') # pylint: disable=abstract-class-instantiated
df.to_excel(writer, sheet_name='Sheet1', index=False)
df.style.set_properties(**{'text-align': 'center'})
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
writer.save()
The error I'm getting is TypeError: Input data can not be a list.
The data coming from test_data is a CSV with a team name and an OBS value, which is a float,
like this: NYY 0.324.
Every solution I've seen says to put the input in a 2D array, as I did with Xnew = [[ OBS ]],
but I'm still getting the error.
Is there something else I need to do to the incoming test_data? I tried using values.reshape, but that didn't fix it either.
You need to transform your Xnew:
Xnew = np.array(Xnew).reshape((1,-1))
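This turns the plain list into the (1, n_features) array that predict expects. A quick sketch of the shapes, using the OBS value from the question:
import numpy as np

Xnew = [[0.324]]                        # a plain nested list is what triggers the TypeError above
Xnew = np.array(Xnew).reshape((1, -1))  # shape (1, 1): one sample, one feature
ynew = model.predict(Xnew)              # now accepted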
I am trying to switch from transforming my data with a scaler to using quadratic.fit_transform.
Here is my code:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
training_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Training_data.csv")
test_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Test_Data.csv")
X = training_data.iloc[:,1:4] #independent columns
y = training_data.iloc[:,-1] #target column
model = LinearRegression()
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)
model.fit(X_quad,y)
results = []
index = 0
count = 0
while count < len(test_data):
    name = test_data.loc[index].at['Player_Name']
    Scrambling = test_data.loc[index].at['Scrambling']
    Total_Putts_GIR = test_data.loc[index].at['Total_Putts_GIR']
    SG_Putting = test_data.loc[index].at['SG_Putting']
    Xnew = [[ Scrambling, Total_Putts_GIR, SG_Putting ]]
    # make a prediction
    ynew = model.predict(Xnew)
    # show the inputs and predicted outputs
    results.append(
        {
            'Name': name,
            'Projection': round(ynew[0], 2)
        }
    )
    index += 1
    count += 1
sorted_results = sorted(results, key=lambda k: k['Projection'], reverse=True)
df = pd.DataFrame(sorted_results, columns=[
'Name', 'Projection'])
writer = pd.ExcelWriter('/Users/aus10/Desktop/PGA/Regressions/Linear_Regressions/Results/Projections_LR_LL.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', index=False)
df.style.set_properties(**{'text-align': 'center'})
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
writer.save()
However, when I run this I get an error saying
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 10 is different from 3)
Is there another step I need to add? I'm not sure why it's changing the size of my input data.
The model was fit on the quadratic features (degree 2 expands your 3 input columns into 10), but model.predict(Xnew) receives the 3 raw features, hence the size-10-vs-3 mismatch. Wrapping PolynomialFeatures and LinearRegression in a pipeline applies the transform automatically at predict time:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
training_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Training_data.csv")
test_data = pd.read_csv("/Users/aus10/Desktop/PGA/History/Memorial/PGA_Test_Data.csv")
X = training_data.iloc[:,1:4] #independent columns
y = training_data.iloc[:,-1] #target column
degree = 2
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)
model.fit(X_train, y_train)  # fit on the training split before evaluating
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (
round(mean_squared_error(y_train, y_train_pred),2),
round(mean_squared_error(y_test, y_test_pred),2)
))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
results = []
index = 0
count = 0
while count < len(test_data):
    name = test_data.loc[index].at['Player_Name']
    Scrambling = test_data.loc[index].at['Scrambling']
    Total_Putts_GIR = test_data.loc[index].at['Total_Putts_GIR']
    SG_Putting = test_data.loc[index].at['SG_Putting']
    Xnew = [[ Scrambling, Total_Putts_GIR, SG_Putting ]]
    # make a prediction (the pipeline expands Xnew to quadratic features itself)
    ynew = model.predict(Xnew)
    # show the inputs and predicted outputs
    results.append(
        {
            'Name': name,
            'Projection': round(ynew[0], 2)
        }
    )
    index += 1
    count += 1
sorted_results = sorted(results, key=lambda k: k['Projection'], reverse=True)
df = pd.DataFrame(sorted_results, columns=[
'Name', 'Projection'])
writer = pd.ExcelWriter('/Users/aus10/Desktop/PGA/Regressions/Linear_Regressions/Results/Projections_LR_LR.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', index=False)
df.style.set_properties(**{'text-align': 'center'})
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)
writer.save()
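If you would rather keep the original non-pipeline code, the minimal fix (a sketch using the fitted quadratic transformer from the question) is to expand each new sample into the same 10 quadratic features before predicting:
Xnew = [[ Scrambling, Total_Putts_GIR, SG_Putting ]]
ynew = model.predict(quadratic.transform(Xnew))  # 3 raw features -> the 10 features the model was fit on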
I'm using XGBoost for feature importance. I want to select the features that account for 90% of the importance, so first I build a DataFrame (because I need it for Excel) and then write a while loop to accumulate the features that reach 90% of the importance. After this there is a neural network (but it isn't in the code below). I know there may be easier ways to do this, but it gives me an error:
ValueError: could not convert string to float: '0,25691372'
The code is
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from matplotlib import pyplot as plt
dataset = pd.read_csv('CompleteDataSet_original_Clean_CONC.csv', decimal=',', delimiter = ";")
from sklearn.metrics import r2_score
label = dataset.iloc[:,-1]
features = dataset.drop(columns = ['Label'])
y_max_pre_normalize = max(label)
y_min_pre_normalize = min(label)
def denormalize(y):
    final_value = y*(y_max_pre_normalize - y_min_pre_normalize) + y_min_pre_normalize
    return final_value
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, label, test_size = 0.20, random_state = 1, shuffle = True)
y_test2 = y_test1.to_frame()
y_train2 = y_train1.to_frame()
scaler1 = preprocessing.MinMaxScaler()
X_train = scaler1.fit_transform(X_train1)
X_test = scaler1.transform(X_test1)  # reuse the scaler fitted on the training set
scaler3 = preprocessing.MinMaxScaler()
y_train = scaler3.fit_transform(y_train2)
y_test = scaler3.transform(y_test2)
sel = XGBRegressor(colsample_bytree= 0.7, learning_rate = 0.005, max_depth = 5, min_child_weight = 3, n_estimators = 1000)
sel.fit(X_train, y_train)
importances = sel.feature_importances_
importances = [str(i) for i in importances]
importances = [i.replace(".", ",") for i in importances]
df1 = pd.DataFrame(features.columns)
df1.columns = ['Features']
df2 = pd.DataFrame(importances)
df2.columns = ['Importances [%]']
result = pd.concat([df1,df2],axis = 1)
result = result.sort_values(by='Importances [%]', ascending=False)
result.to_excel("Feature_Results.xlsx")
i = 0
somma = 0
feature = []
while somma <= 0.9:
    a = result.iloc[i, -1]
    somma = float(a) + somma
    feature.append(result.iloc[i, -2])
    i = i + 1
The immediate fix is to swap the comma for a period before converting:
float('0,25691372'.replace(",", "."))
You could use locale.atof() to handle , being used as the decimal separator.
import locale
locale.setlocale(locale.LC_ALL, 'fr_FR')
...
somma = locale.atof(a) + somma
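Note that available locale names vary by platform; on some systems you may need 'fr_FR.UTF-8', and the locale must be installed for setlocale to succeed.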
Try to convert "0,0001" into "0.0001" and then convert the string to float.
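A minimal sketch of that suggestion, applied to the value from the traceback:
a = '0,25691372'
value = float(a.replace(',', '.'))  # '0,25691372' -> 0.25691372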
I am very new to data science/pandas in general. I mainly followed this and could get it to work using different classifiers.
import pandas as pd
import src.helper as helper
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
# Headings
headings = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing',
'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
# Load the data
shrooms = pd.read_csv('data/shrooms_no_header.csv', names=headings, converters={"header": float})
# Replace the ? in 'stalk-root' with 0
shrooms.loc[shrooms['stalk-root'] == '?', 'stalk-root'] = np.nan
shrooms.fillna(0, inplace=True)
# Remove columns with only one unique value
for col in shrooms.columns.values:
    if len(shrooms[col].unique()) <= 1:
        print("Removing column {}, which only contains the value: {}".format(col, shrooms[col].unique()[0]))
        shrooms.drop(col, axis=1, inplace=True)
# Col to predict later
col_predict = 'class'
# Binary Encoding
all_cols = list(shrooms.columns.values)
all_cols.remove(col_predict)
helper.encode(shrooms, [col_predict])
# Expand Shrooms DataFrame to Binary Values
helper.expand(shrooms, all_cols)
# Remove the class we want to predict
x_all = list(shrooms.columns.values)
x_all.remove(col_predict)
# Set Train/Test ratio
ratio = 0.7
# Split the DF
df_train, df_test, X_train, Y_train, X_test, Y_test = helper.split_df(shrooms, col_predict, x_all, ratio)
# Try different classifier
# TODO: Batch Use to compare
classifier = GradientBoostingClassifier(n_estimators=1000)
# TODO: Optimize Hyperparamter (where applicable)
# Time the training
timer_start = time.process_time()
classifier.fit(X_train, Y_train)
timer_stop = time.process_time()
time_diff = timer_stop - timer_start
# Get the score
score_train = classifier.score(X_train, Y_train)
score_test = classifier.score(X_test, Y_test)
print('Train Score {}, Test Score {}, Time {}'.format(score_train, score_test, time_diff))
# TODO: Test a manual DataFrame
The "helpers" are functions I don't quite understand fully, but they work:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
def split_df(df, y_col, x_cols, ratio):
    """
    This method transforms a dataframe into a train and test set, for this you need to specify:
    1. the ratio train : test (usually 0.7)
    2. the column with the Y_values
    """
    mask = np.random.rand(len(df)) < ratio
    train = df[mask]
    test = df[~mask]
    y_train = train[y_col].values
    y_test = test[y_col].values
    x_train = train[x_cols].values
    x_test = test[x_cols].values
    return train, test, x_train, y_train, x_test, y_test

def encode(df, columns):
    for col in columns:
        le = LabelEncoder()
        col_values_unique = list(df[col].unique())
        le_fitted = le.fit(col_values_unique)
        col_values = list(df[col].values)
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed

def expand(df, list_columns):
    for col in list_columns:
        colvalues = df[col].unique()
        for colvalue in colvalues:
            newcol_name = "{}_is_{}".format(col, colvalue)
            df.loc[df[col] == colvalue, newcol_name] = 1
            df.loc[df[col] != colvalue, newcol_name] = 0
    df.drop(list_columns, inplace=True, axis=1)

def correlation_to(df, col):
    correlation_matrix = df.corr()
    correlation_type = correlation_matrix[col].copy()
    abs_correlation_type = correlation_type.apply(lambda x: abs(x))
    desc_corr_values = abs_correlation_type.sort_values(ascending=False)
    y_values = list(desc_corr_values.values)[1:]
    x_values = range(0, len(y_values))
    xlabels = list(desc_corr_values.keys())[1:]
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.bar(x_values, y_values)
    ax.set_title('The correlation of all features with {}'.format(col), fontsize=20)
    ax.set_ylabel('Pearson correlation coefficient [absolute value]', fontsize=16)
    plt.xticks(x_values, xlabels, rotation='vertical')
    plt.show()
I would like to have a "manual" test, such as entering x attributes and getting a prediction based on that.
So for example, I hardcode a DataFrame like the following:
manual = pd.DataFrame({
"cap-shape": ["x"],
"cap-surface": ["s"],
"cap-color": ["n"],
"bruises": ["f"],
"odor": ["n"],
"gill-attachment": ["a"],
"gill-spacing": ["c"],
"gill-size": ["b"],
"gill-color": ["y"],
"stalk-shape": ["e"],
"stalk-root": ["?"],
"stalk-surface-above-ring": ["s"],
"stalk-surface-below-ring": ["s"],
"stalk-color-above-ring": ["o"],
"stalk-color-below-ring": ["o"],
"veil-type": ["p"],
"veil-color": ["o"],
"ring-number": ["o"],
"ring-type": ["p"],
"spore-print-color": ["o"],
"population": ["c"],
"habitat": ["l"]
})
How would I apply the same encoding? My code calls helper.encode(manual, [col_predict]), but manual of course does not have a col_predict column.
Please bear in mind I am a complete beginner; I searched the web a lot, but I cannot find a proper source/tutorial that lets me test a single set.
The full code can be found here.
Try this:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
data = pd.read_csv('agaricus-lepiota.data.txt', header=None) #read data
data.rename(columns={0: 'y'}, inplace = True) #rename predict column (edible or not)
encoders = {}  # keep one fitted LabelEncoder per column so the same mappings can be reused later
for col in data.columns:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])
X = data.drop('y', axis=1)  # X without the predict column
y = data['y']  # predict column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = GradientBoostingClassifier()#you can pass arguments
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test) #it is predict for objects in test
print(accuracy_score(y_test, y_pred)) #check accuracy
I think you can read more about this on the sklearn site.
Is this example what you want?
To check your manual data, reuse the encoders that were fitted on the training columns; calling fit_transform again on a single row would assign new, meaningless codes. This assumes manual's columns are in the same order as the training columns:
manual.columns = X.columns  # align the column labels with the training frame
manual_encoded = manual.apply(lambda col: encoders[col.name].transform(col))
clf.predict(manual_encoded)
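If you want the original class letter back instead of the encoded integer, a small follow-up using the per-column encoders stored above:
pred = clf.predict(manual_encoded)
print(encoders['y'].inverse_transform(pred))  # maps the integer back to the original label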