import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
df = pd.read_csv('CarSeats_Dataset.csv')
dummies=pd.get_dummies(df[['ShelveLoc', 'Urban', 'US']])
X = df.drop('Sales',axis=1)
y = np.log(df['Sales'])
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
regressor = DecisionTreeRegressor(random_state = 42), y_train)
I was trying to predict the Sales but when tried to fit the regressor I got the error : <ValueError: could not convert string to float: 'Bad'/>
I am a beginner in this and I do not know how to fix it. Can anyone help me with that please?
import pandas as pd
Data = {'Product': ['ABC','XYZ'],
'Price': ['250','270']}
df = pd.DataFrame(Data)
df['Price'] = df['Price'].astype(float)
print (df)
print (df.dtypes)
Following is my code
I am running it on IDLE python 3.8
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn import trees
from sklearn.metrics import accuracy_score,classification_report
import warnings
from sklearn.preprocessing import StandardScalar
from sklearn.neural_networks import MLPClassifier
print('Train set')
print('Test set')
print('Test Data')
It shows me error for these particular lines
It only says invalid syntax error
packages are imported correctly
Your code contains multiple issues:
The import should be StandardScaler not StandardScalar,
You got unused imports like MLPClassifier,
cols_to_retrain is empty. Thus, data[cols_to_retrain] will return an empty data frame,
to_dict should be to_dict(),
variable names x-feature and x_feature do not match,
LabelEncoder is missing brackets (),
x_train=[:-1] and x_test=[-1:] is not valid. You probably wanted to select a subset like x_train = x_vector[:-1] or x_test = x_vector[-1:]. Please add additional sample data, if you need help with this selection.
Here is an updated version of your code:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
data = pd.read_csv("data.csv")
cols_to_retain = []
x_feature = data[cols_to_retain]
x_dict = x_feature.T.to_dict().values()
vect = DictVectorizer(sparse=False)
x_vector = vect.fit_transform(x_dict)
x_train = x_vector[:-1]
x_test = x_vector[-1:]
print("Train set")
print("Test set")
le = LabelEncoder()
y_train = le.fit_transform(data["Goal"][:-1])
clf = DecisionTreeClassifier(criteron="entropy")
clf = clf.fit_transform(x_train, y_train)
print("Test Data")
I am working on an assignment and I run into this error. I am using python to perform an KNN on a data set. I pretty sure I defined the variable but it says otherwise. This code is written below.
import pandas as PD
import numpy as np
import matplotlib.pyplot as mtp
data_set= PD.read_csv('')
x= data_set.iloc[:,[2,3]].valuesS
y= data_set.iloc[:, 4].values
from sklearn.model_selection import train_test_split
x_train, x_train, y_train, y_train= train_test_split(x,y, test_size=.25, random_state=0)
from sklearn.preprocessing import StandardScaler
st_x= StandardScaler()
x_train= st_x.fit_transform(x_train)
x_test= st_x.transform(x_test)
import pandas as PD
import numpy as np
import matplotlib.pyplot as mtp
data_set= PD.read_csv('')
x= data_set.iloc[:,[2,3]].valuesS
y= data_set.iloc[:, 4].values
from sklearn.model_selection import train_test_split
x_train, x_train, y_train, y_train= train_test_split(x,y, test_size=.25, random_state=0)
from sklearn.preprocessing import StandardScaler
st_x= StandardScaler()
x_train= st_x.fit_transform(x_train)
x_test= st_x.transform(x_test)
The error says "x_test" is not defined Pylance (reportUndefinedVarible)
I was trying to understand differences between OneHotEncoder and get_dummies from this link: enter link description here
When I wrote exact same code, I am getting an error and it says
AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names_out'
Here is the code:
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
df = sns.load_dataset('tips')
df = df[['total_bill', 'tip', 'day', 'size']]
X = df.drop('tip', axis=1)
y = df['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype='int')[['day']])
def get_ohe(df):
temp_df = pd.DataFrame(data=ohe.transform(df[['day']]), columns=ohe.get_feature_names_out())
df.drop(columns=['day'], axis=1, inplace=True)
df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
return df
X_train = get_ohe(X_train)
X_test = get_ohe(X_test)
I checked OneHotEncoder from sklearn.preprocessing module and get_feature_names_out() method is there and it is not deprecated. I don't know why I am getting this error.
If you're using scikit-learn version lower than 1.0, you need to use get_feature_names method. For newer versions of scikit-learn, get_feature_names_out will work fine.
This is my code.
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, f1_score, roc_auc_score
import warnings; warnings.simplefilter('ignore')
data_files = 'dataset_for_learning_decision_tree.xlsx'
data = pd.read_excel(data_files)
train_data = data[['title','category','processed_title']]
X_train, X_test, y_train, y_test = train_test_split(train_data['processed_title'],train_data['category'],test_size=0.2,random_state=57)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_train)
model = Pipeline([('vect', vectorizer),
('tfidf', TfidfTransformer()),
('clf', decisiontree),
predicted = model.predict(X_test)
import numpy as np
from mlxtend.plotting import plot_decision_regions
I want to draw a plot_decision_region.
However, when I executed this code, I got the same error as the title.
When running with y=y.astype(np.integer), I get errors such as ValueError: invalid literal for int() with base 10: 'depression'. How should I fix it?
Convert the class labels to integers first,
import numpy as np
from mlxtend.plotting import plot_decision_regions
X = np.array(X_train)
y = np.array(y_train)
d = {'addiction':0, 'depression':1, 'normal':2}
y = list(map(lambda i : d[i], y))
I am trying to train a Linear Regression Qualifier to continue a grap.
I have a couple of thousand lines of data in my csv file that I import into numpy arrays. Here is my code :
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import csv
import math
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
def predict():
sample_data = pd.read_csv("includes\\csv.csv")
x = np.array(sample_data["day"])
y = np.array(sample_data["balance"])
for x in x:
x = x.reshape(1, -1)
for y in y:
y.reshape(1, -1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
clf = LinearRegression(), y_train)
clf.score(x_test, y_test)
When I run this, the error is:
TypeError: Singleton array 6014651 cannot be considered a valid collection.
Any ideas why that's a thing?
After discussion in comments:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import csv
import math
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
def predict():
sample_data = pd.read_csv("includes\\csv.csv")
x = np.array(sample_data["day"])
y = np.array(sample_data["balance"])
x = x.reshape(-1,1)
y = y.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
clf = LinearRegression(), y_train)
clf.score(X_test, y_test)
X_train, X_test should be capitals, python variables are case sensitive