I am trying to build the following model but am getting this error when I am finally training the model and trying to get it's accuracy. It gets stuck when I am feedingg in the trainiing data in my linear model to train it.
Here is the whole code->
# Importing all needed libraries to build the model->
import tensorflow as tf
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow.compat.v2.feature_column as fc
print("All modules imported.")
# Loading a data_set to train our model-->
training_data = pd.read_csv("Real estate.csv")
testing_data = pd.read_csv("Real estate eval.csv")
# print(training_data.head())
y_train = training_data.pop("Y house price of unit area")
y_test = testing_data.pop("Y house price of unit area")
# print(training_data.head())
print(y_train)
print(y_test)
numerical_colunms = ["No","X1 transaction date","age","X3 distance to the nearest MRT station","X4 number of convenience stores",
"X5 latitude","X6 longitude"]
feature_colunms=[]
for feature_name in numerical_colunms:
print(feature_name)
feature_colunms.append(tf.feature_column.numeric_column(feature_name,dtype=tf.float32))
print(feature_colunms)
# Making an input function to ddistribute our data into vatches,batch size and define the no.of epochs->
def make_input_fn(data_df, y_df, num_epochs=10,shuffle=True,batch_size=32):
def input_function():
ds = tf.data.Dataset.from_tensor_slices(dict(data_df),y_df)
if shuffle:
ds = ds.shuffle(1000)
ds = ds.batch(batch_size).repeat(num_epochs)
return ds
# This would be returning a function object for use->
return input_function
# Finally preparing the objects for training and testing data that shall be fedd into our model->
training_data_input = make_input_fn(training_data,y_train)
testing_data_input = make_input_fn(testing_data,y_test,num_epochs=1,shuffle=True)
# Actuallu making the linear model :)
linear_model = tf.estimator.LinearClassifier(feature_columns=feature_colunms)
# Training the model built=>>
linear_model.train(training_data_input)
results = linear_model.evaluate(testing_data_input)
clear_output()
print(f"The accuracy of the model ia >> {results['accuracy']}")
The main error is showing here-
linear_model.train(training_data_input)
Related
I'm trying to implement a prediction using the Cleveland dataset from kaggle.com. I want to use shufflenet to implement as below in deep learning. All the examples I have seen use image datasets. I need guidance on how to go about this using non-image datasets like the Cleveland or SAHeart heart disease datasets.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score
from keras.models import Sequential
from keras.layers import Dense
############# main ################
if __name__ == '__main__':
lc= "heart.csv"
dataset = pd.read_csv(lc)
predictors = dataset.drop("target",axis=1)
target = dataset["target"]
X_train,X_test,Y_train,Y_test = train_test_split(predictors,target,test_size=0.20,random_state=0)
model = Sequential()
model.add(Dense(11,activation='relu',input_dim=13))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X_train,Y_train,epochs=300)
#Predict
Y_pred_nn = model.predict(X_test)
rounded = [round(x[0]) for x in Y_pred_nn]
Y_pred_nn = rounded
score_nn = round(accuracy_score(Y_pred_nn,Y_test)*100,2)
precision_score_nn = precision_score(Y_pred_nn, Y_test, average=None)
#Print Accuracy score
print("The accuracy score achieved using Neural Network is: "+str(score_nn)+" %")
print("The precision score achieved using Neural Network is: "+str(precision_score_nn)+" %")
#importing packages
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
import numpy as np
import pandas as pd
from surprise.model_selection import train_test_split
#Load movielens-100k dataset
required_data = Dataset.load_builtin('ml-100k')
#Dividing the data in test and train
trainset, testingset = train_test_split(required_data,test_size=.25)
#initializing the SVD Algorithm
model_algo = SVD(n_factors=19)
#Fitting the model
model_algo.fit(trainset)
model_algo.qi.shape
model_algo.qi
what should i write further to view that [user x factors] and other matrices?
[user x latentfactors] matrix can be viewed using model_algo.pu
[latentfactors x Items] matrix can be viewed using model_algo.qi
Here is my code and I am loading the csv files from the drive
# -*- coding: utf-8 -*-
"""Titanic_Linear_Regression_Model.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1MMY9m7QGpqWVj-zyv2oaIJQZ2V__7AGX
"""
pip install -q sklearn
# Commented out IPython magic to ensure Python compatibility.
# %tensorflow_version 2.x
"""**Predicting Survivals of Titanic using linear regression model**
```
# This is formatted as code
```
Predicting Survivals of Titanic using linear regression model.
Load the data the titanic.csv data, perform analysis to gain better understaning of the data
Build the model, train and evaluate it
# New Section
"""
# Commented out IPython magic to ensure Python compatibility.
#import all necessary libraries
# %tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow as tf
#Install PyDrive
!pip install -U -q PyDrive
#import necessary modules for the PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
#authenticate and create a PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
#gender_submission_path = https://drive.google.com/file/d/1e0ZLmv8G-kVDJgeeb1b1REWJG0yNT3jc/view?...
#testing_data_path = https://drive.google.com/file/d/1SCaUViZG8qt1q5K5_cdBkFCM4VN8H_5z/view?...
#training_data_path= https://drive.google.com/file/d/1lLrhVKTGuQiJI5kTQkKF6if_r5B5S9HN/...
fileDownloaded = drive.CreateFile({"id":"1e0ZLmv8G-kVDJgeeb1b1REWJG0yNT3jc"})
fileDownloaded = drive.CreateFile({"id":"1SCaUViZG8qt1q5K5_cdBkFCM4VN8H_5z"})
fileDownloaded = drive.CreateFile({"id":"1lLrhVKTGuQiJI5kTQkKF6if_r5B5S9HN"})
#Loading the files
fileDownloaded.GetContentFile("gender_submission.csv")
fileDownloaded.GetContentFile("training_data.csv")
fileDownloaded.GetContentFile("testing_data.csv")
#Using proper encoding to read the csv files
df_train = pd.read_csv("training_data.csv", encoding='utf-8', quotechar='"', delimiter=',') #training data
df_test = pd.read_csv("testing_data.csv", encoding='utf-8', quotechar='"', delimiter=',') #test data
df_gender = pd.read_csv("gender_submission.csv", encoding='utf-8', quotechar='"', delimiter=',') #gender_submission dataset
#since our data contains NaN, lets convert them into empty strings
df_train.fillna('', inplace=True)
df_test.fillna('', inplace=True)
#removes Survival column from the train dataset and save it as y_train
y_train = df_train.pop('Survived')
#create a new column of train survival naming "y_train"
print(y_train.head())
#removes the Survived column from the test dataset as save as y_test
y_test = df_gender.pop('Survived')
print(y_test.head())
df_train.head() #prints overhead of the training datasets
"""**Analysis on the train dataset**"""
print(df_train.describe()) #print train dataset stats
print(df_train.shape) #prints the shape of the train data
#Generate histogram data for ages
#df_train.Age.hist(bins = 20)
#count sex to generate the graph of number of females and males
df_train['Sex'].value_counts().plot(kind = 'bar')
#generate a pie graph for the Pclass
df_train['Pclass'].value_counts().plot(kind = "pie")
#concatenate the train and y_train datasets and genergate a graph grouped by sex
pd.concat([df_train, y_train], axis = 1).groupby('Sex').Survived.mean().plot(kind = "bar").set_xlabel("% Survival")
#concatenate the train and y_train datasets and genergate a graph grouped by Age
pd.concat([df_train, y_train], axis = 1).groupby('Pclass').Survived.mean().plot(kind = "bar").set_xlabel("% Survival per class")
"""The data shows that the majority of the passengers were aged between 15 to 40 years and majority of the passengers were males. The data shows that most of the passengers were based in the passenger class 3. Females have much higher chance of survival than males, 70% of females survived compared to 20% of males. The data suggest that Passengers who were in class 1 had a higher chance of survival than those who were in class 2 and 3
**Create feature columns that will be used to feed the model**
"""
#feature columns that will be used to feed the model
categorical_columns = ["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked", "SibSp"]
numerical_columns = ["Age", "Fare"]
feature_columns = []
#gets a list of unique values
for feature_name in categorical_columns:
vocabulary = df_train[feature_name].unique()
feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(
feature_name, vocabulary, dtype=None, default_value=-1, num_oov_buckets=0
))
print(feature_columns)
"""**Training the model using feature categorical_columns**
Create an input function that will convert the data into tf.data.Dataset
"""
#Create an input function that will convert the data into tf.data.Dataset
def make_input_fn(data_df, label_df, num_epochs = 10, shuffle = True, batch_size = 32):
#input fn to be returned by
def input_function():
#create tf.data.Dataset object with data and its label
ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
if shuffle:
ds.ds.shuffle(1000) #shuffles data 1000 times
ds = ds.batch(batch_size).repeat(num_epochs) #splits dataset into 32 batches and reeats the process 10 times
return ds #retursn a batch of a dataset
return input_function #returns function object for useage
#Calls the input function that we returned to create the train and test functions
train_input_fn = make_input_fn(df_train, y_train) #train function
test_input_fn = make_input_fn(df_test, y_test, num_epochs = 1, shuffle = False) #testing function
"""**Creating the Model**
Use linear estimator to utalize linear regression algorithm
"""
#creates the linear estimator by passing the feature_columns
linear_estimator = tf.estimator.LinearClassifier(feature_columns = feature_columns)
#Training the model by passing the training function
linear_estimator.train(train_input_fn)
The error message I am getting:
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
tensorflow/python/framework/fast_tensor_util.pyx in tensorflow.python.framework.fast_tensor_util.AppendObjectArrayToTensorProto()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/compat.py in as_bytes(bytes_or_text, encoding)
85 else:
86 raise TypeError('Expected binary or unicode string, got %r' %
---> 87 (bytes_or_text,))
88
89
TypeError: Expected binary or unicode string, got 22.0
enter image description here
Probably you are using the wrong encoding to read the csv files.
You can use the chardet module to try to detect the right encoding first - by reading first few bytes of the csv files.
import csv
import chardet
with open("training_data.csv", mode="rb") as ifile:
# may need to increase byte count from 32 for better accuracy
raw_bytes = ifile.read(32)
encoding_name = chardet.detect(raw_bytes)["encoding"]
Then, you can use this encoding in your read_csv():
pd.read_csv("training_data.csv", encoding=encoding_name, quotechar='"', delimiter=',')
I decided to remove the NaN rows in the data
#Check info for data type
print(df_train.info())
print("\n", df_test.info())
#Drop all the NaN
df_train.dropna(axis = 0, inplace = True)
df_test.dropna(axis = 0, inplace = True)
The error is solved
I was trying to use dask for kaggle fraud detection classification problem.
But, when I build the model, model predicts all the values as 1.
I am truly surprised, since there are 56,000 zeors and 92 ones in test data, still the model somehow predicts all values as ones.
I am obviously doing something wrong. How to use the model correctly?
MWE
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
import dask_ml
from dask_ml.xgboost import XGBClassifier
import collections
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
# set up cluster
client = Client(n_workers=4)
# load the data
ifile = "https://github.com/vermaji333/MLProject/blob/master/creditcard.zip?raw=true"
#!wget https://github.com/vermaji333/MLProject/blob/master/creditcard.zip?raw=true
#ifile = 'creditcard.zip'
ddf = dd.read_csv(ifile,compression='zip',
blocksize=None,
assume_missing=True)
# train-test split
target = 'Class'
Xtr, Xtx, ytr, ytx = train_test_split(
ddf.drop(target,axis=1),
ddf[target],
test_size=0.2,
random_state=100,
shuffle=True
)
# modelling
model = XGBClassifier(n_jobs=-1,
random_state=100,
scale_pos_weight=1, # default
objective='binary:logistic')
model.fit(Xtr,ytr)
ypreds = model.predict(Xtx)
ytx = ytx.compute()
ypreds = ypreds.compute()
# model evaluation
print(collections.Counter(ytx)) # Counter({0.0: 56607, 1.0: 92})
print(collections.Counter(ypreds)) # this gives all 1's
Update
I tried various values of scale pos weights.
I tried various scale_pos_weights
collections.Counter(ytr)
Counter({0.0: 227708, 1.0: 400})
scale_pos_weight= 227708/400
scale_pos_weight= 400/227708
scale_pos_weight= other values
But, for all parameters, I got all 1's as the result:
print(collections.Counter(ytx)) # Counter({0.0: 56607, 1.0: 92})
print(collections.Counter(ypreds)) # this gives all 1's
Counter({0.0: 56607, 1.0: 92})
Counter({1: 56699})
I created a multiple input one output LSTM that estimated the total price with a dataset of daily room rates for a hotel by month, but the model I created doesn't work well. Below I shared the model's code and the link to the data set.
data = pd.read_csv("/content/drive/My Drive/hotels.csv")
data
enter image description here
new_data = data.loc[:,['date','days','price','total']]
new_data.info()
date = new_data.date.values
dates = []
for i in date:
dates.append(i.split('/')[0])
new_data['date'] = dates
new_data
enter image description here
new_data = new_data.astype('float32')
new_data.info()
enter image description here
import pickle
filehandler = open(b"Hotels.obj","wb")
pickle.dump(new_data,filehandler)
file = open("/content/Hotels.obj",'rb')
object_file = pickle.load(file)
object_file
enter image description here
from math import sqrt
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Concatenate
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop,Adam
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import datetime
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from packaging import version
print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
"This notebook requires TensorFlow 2.0 or above."
file = open('/content/Hotels.obj', 'rb')
scaler = MinMaxScaler(feature_range=(0, 1))
train_size = int(len(object_file) * 0.76)
test_size = len(object_file) - train_size
days = object_file["days"].values.reshape(-1,1)
price = object_file["price"].values.reshape(-1,1)
total = object_file["total"].values.reshape(-1,1)
date = object_file["date"].values.reshape(-1,1)
days_ = scaler.fit_transform(days)
total_ = scaler.fit_transform(total)
price_ = scaler.fit_transform(price)
date_ = scaler.fit_transform(date)
days_train = days_[0:train_size].reshape(train_size,1,1)
days_test = days_[train_size:len(days_)].reshape(test_size,1,1)
date_train = date_[0:train_size].reshape(train_size,1,1)
date_test = date_[train_size:len(days_)].reshape(test_size,1,1)
price_train = price_[0:train_size].reshape(train_size,1,1)
price_test = price_[train_size:len(price_)].reshape(test_size,1,1)
total_train = total_[0:train_size].reshape(train_size,1)
total_test = total_[train_size:len(total_)].reshape(test_size,1)
def buildModel(dataLength,labelLength):
date = tf.keras.Input(shape=(1,1),name='date')
days = tf.keras.Input(shape=(1,1),name='days')
price = tf.keras.Input(shape=(1,1),name='price')
dateLayers = LSTM(100,return_sequences=False)(date)
daysLayers = LSTM(100,return_sequences=False)(days)
priceLayers = LSTM(100,return_sequences=False)(price)
output = tf.keras.layers.concatenate(inputs=[dateLayers,daysLayers, priceLayers],axis=1)
output = Dense(labelLength,activation='relu',name='weightedAverage_output_3')(output)
model = Model(inputs=[date,days,price],outputs=[output])
optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=optimizer,loss='mse',metrics=['accuracy'])
return model
object_file = pickle.load(file)
logdir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
rnn = buildModel(train_size,1)
rnn.fit([date_train,days_train,price_train],
[total_train],
validation_data = ([date_test,days_test,price_test],[total_test]),
epochs = 1,
batch_size = 10,
callbacks=[tensorboard_callback]
)
result = rnn.predict([date_test,days_test,price_test])
scaler.inverse_transform(result)
enter image description here
When I increase the number of epoch, the model is being overfit.I can't get the result I want.How can I do this?
Data set link : https://www.kaggle.com/leomauro/argodatathon2019#hotels.csv
Your results are poor because your metrics is accuracy. If I understand correctly, you're predicting a continuous variable — you're not classifying. So, it makes no sense to look at accuracy.
Metrics should be mae for mean absolute error. I think you'll be satisfied with your model performance then.
Re-scaling your target makes no sense here. It's the inner workings of the neural network that prefer an input between 0 and 1.