creating a dataset function using scikit-learn - python

So I am pretty new to Python, and I am trying to load a dataset from my computer using scikit-learn. This is what my code looks like:
**whatever.py**
import numpy as np
import csv
from sklearn.datasets.base import Bunch

class Cortex_nuc:
    def cortex_nuclear():
        with open('C:/Users/User/Desktop/Data_Cortex_Nuclear4.csv') as csv_file:
            data_file = csv.reader(csv_file)
            temp = next(data_file)
            n_samples = int(float(temp[0]))
            n_features = int(float(temp[1]))
            data = np.empty((n_samples, n_features))
            target = np.empty((n_samples,), dtype=np.float64)

            for i, sample in enumerate(data_file):
                data[i] = np.asarray(sample[:-1], dtype=np.float64)
                target[i] = np.asarray(sample[-1], dtype=np.float64)

        return Bunch(data=data, target=target)
so then I import it into my project:
from whatever import Cortex_nuc
and after that I try to save it into df:
df = Cortex_nuc.cortex_nuclear()
Btw, this is just a part of what the dataset looks like; the full dataset has 77 columns and about a thousand rows.
But I get an error message and I can't seem to figure out why it's happening. Here's the error message:
IndexError Traceback (most recent call last)
<ipython-input-5-a4935f2c187f> in <module>
----> 1 df = Cortex_nuc.cortex_nuclear()
~\whatever.py in cortex_nuclear()
20
21 for i, sample in enumerate(data_file):
---> 22 data[i] = np.asarray(sample[:-1], dtype=np.float64)
23 target[i] = np.asarray(sample[-1], dtype=np.float64)
24
IndexError: index 0 is out of bounds for axis 0 with size 0
Can someone please help me? Thanks!

If you want to create a "sklearn-like" dataset in a Bunch object, you probably want something like this:
import pandas as pd
import numpy as np
from sklearn.utils import Bunch

# For reproducing
from io import StringIO

csv_file = StringIO("""
target,A,B
0,0,0
1,0,1
1,1,0
0,1,1
""")

def load_xor(*, return_X_y=False):
    """Describe your data here."""
    _data_file = pd.read_csv(csv_file)
    _data = Bunch()
    _data["DESCR"] = load_xor.__doc__
    _data["data"] = _data_file[["A", "B"]].to_numpy(dtype=np.float64)
    _data["target"] = _data_file["target"].to_numpy(dtype=np.float64)
    _data["target_names"] = np.array(["false", "true"])
    _data["feature_names"] = np.array(list(_data_file.drop(["target"], axis=1)))
    if return_X_y:
        return _data.data, _data.target
    return _data

if __name__ == "__main__":
    # Return and unpack the `X`, `y` tuple
    X, y = load_xor(return_X_y=True)
    print(X, y)
This is because the sklearn.datasets loaders typically return Bunch objects with specific attributes/keys (for explanations, see the "Returns" section of the load_iris documentation):
>>> from sklearn.datasets import load_iris
>>> data = load_iris()
>>> dir(data)
['DESCR', 'data', 'feature_names', 'filename', 'frame', 'target', 'target_names']
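For the CSV from the question, a minimal sketch of the same pattern might look like the following. The function name is made up, the path is the one from your code, and it assumes the last column of the file is the target while the remaining columns are numeric features; adjust those details to your data.

import numpy as np
import pandas as pd
from sklearn.utils import Bunch

def load_cortex_nuclear(*, return_X_y=False):
    """Mice protein expression data loaded from a local CSV file."""
    # Assumption: the last column is the target, all other columns are numeric.
    df = pd.read_csv("C:/Users/User/Desktop/Data_Cortex_Nuclear4.csv")
    target_col = df.columns[-1]
    features = df.drop(columns=[target_col])
    data = Bunch()
    data["DESCR"] = load_cortex_nuclear.__doc__
    data["data"] = features.to_numpy(dtype=np.float64)
    data["target"] = df[target_col].to_numpy()
    data["feature_names"] = np.array(features.columns)
    if return_X_y:
        return data.data, data.target
    return data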

'DataFrame' object has no attribute 'flush'

I'm trying to solve the Boston house price prediction problem, but it has this error:
AttributeError: 'DataFrame' object has no attribute 'flush'
and this:
Cell In [53], line 7, in load_data()
      5 def load_data():
      6     datafile= pd.read_csv("housing.csv",sep=',')
----> 7     data = np.fromfile(datafile)
      8     feature_names = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
      9     feature_num = len(feature_names)
Here's a part of my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def load_data():
    datafile = pd.read_csv("housing.csv", sep=',')
    data = np.fromfile(datafile)
    feature_names = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
    feature_num = len(feature_names)
    data = data.reshape(data.shape[0] // feature_num, feature_num)
    ratio = 0.8
    offset = int(data.shape[0] * ratio)
    training = data[:offset]
    maximums, minimums, avge = training.max(axis=0), training.min(axis=0), training.sum(axis=0) / training.shape[0]
the word "flush" doesn't appear in my code or in my data
can anyone give me some idea?
You are reading the housing.csv file with pd.read_csv, which converts it to a DataFrame object. This leads to the error, because np.fromfile expects a file (str or path), not a DataFrame.
To get rid of the error, replace the first two statements in the load_data function with a single suitable NumPy function such as np.genfromtxt.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def load_data():
    data = np.genfromtxt('housing.csv', delimiter=',')
    feature_names = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
    # [...]
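Alternatively, if you want to keep pandas (for example because housing.csv has a header row), you can read the file with pd.read_csv and convert the relevant columns to a NumPy array yourself. A sketch, assuming the four listed columns exist in the file:

import numpy as np
import pandas as pd

def load_data():
    feature_names = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
    # read_csv already returns a 2-D table, so no reshape is needed
    datafile = pd.read_csv("housing.csv", sep=',')
    data = datafile[feature_names].to_numpy(dtype=np.float64)

    # 80/20 split, as in the original code
    ratio = 0.8
    offset = int(data.shape[0] * ratio)
    training = data[:offset]
    maximums = training.max(axis=0)
    minimums = training.min(axis=0)
    avgs = training.mean(axis=0)
    return data, maximums, minimums, avgs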

Nested renamer not supported, how do I rebuild this code?

I am writing this code but this error appears.
This is my code, found at https://github.com/statisticianinstilettos/recmetrics/blob/master/example.ipynb:
!pip install scipy
!pip install git+https://github.com/statisticianinstilettos/recmetrics
import pandas as pd
import numpy as np
import recmetrics
import matplotlib.pyplot as plt
from surprise import Reader, SVD, Dataset
from surprise.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
from zipfile import ZipFile
# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# Use the ratings.csv file
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"
# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")
ratings_file = movielens_dir / "ratings.csv"
ratings = pd.read_csv(ratings_file)
ratings = ratings.query('rating >=3')
ratings.reset_index(drop=True, inplace=True)
#only consider ratings from users who have rated over n movies
n=1000
users = ratings.userId.value_counts()
users = users[users>n].index.tolist()
ratings = ratings.query('userId in @users')
print(ratings.shape)
ratings.head(3)
# get movie features
rated_movies = ratings.movieId.tolist()
movies_file = movielens_dir / "movies.csv"
movies = pd.read_csv(movies_file)
movies = movies.query('movieId in @rated_movies')
movies.set_index("movieId", inplace=True, drop=True)
movies = movies.genres.str.split("|", expand=True)
movies.reset_index(inplace=True)
movies = pd.melt(movies, id_vars='movieId', value_vars=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
movies.drop_duplicates("movieId", inplace=True)
movies.set_index('movieId', inplace=True)
movies = pd.get_dummies(movies.value)
#movies = movies[['Action', 'Romance', 'Western', 'Comedy', 'Crime']]
movies.head()
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(15, 7))
recmetrics.long_tail_plot(df=ratings,
                          item_id_column="movieId",
                          interaction_type="movie ratings",
                          percentage=0.5,
                          x_labels=False)
#format data for surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.25)
#train SVD recommender
algo = SVD()
algo.fit(trainset)
#make predictions on test set.
test = algo.test(testset)
test = pd.DataFrame(test)
test.drop("details", inplace=True, axis=1)
test.columns = ['userId', 'movieId', 'actual', 'cf_predictions']
test.head()
#evaluate model with MSE and RMSE
print(recmetrics.mse(test.actual, test.cf_predictions))
print(recmetrics.rmse(test.actual, test.cf_predictions))
def get_users_predictions(user_id, n, model):
    recommended_items = pd.DataFrame(model.loc[user_id])
    recommended_items.columns = ["predicted_rating"]
    recommended_items = recommended_items.sort_values('predicted_rating', ascending=False)
    recommended_items = recommended_items.head(n)
    return recommended_items.index.tolist()
#get example prediction
get_users_predictions(274, 10, cf_model)
# Error
test = test.copy().groupby('userId')['movieId'].agg({'actual': (lambda x: list(set(x)))})
Error:
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
<ipython-input-36-91830a4ef799> in <module>()
1 #format test data
----> 2 test = test.copy().groupby('userId')['movieId'].agg({'actual': (lambda x: list(set(x)))})
3
4
5
1 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg)
292 # GH 15931
293 if isinstance(self._selected_obj, Series):
--> 294 raise SpecificationError("nested renamer is not supported")
295
296 columns = list(arg.keys())
SpecificationError: nested renamer is not supported
I read that the dict syntax with brackets {} is no longer supported and that the syntax was changed, but unfortunately I can't find a solution that I can apply to my problem. How do I solve this?
Use named aggregation:
test = pd.DataFrame({
    'movieId': [5, 3, 3, 9, 2, 4, 9],
    'userId': list('aaabbbb')
})

test = test.groupby('userId').agg(actual=('movieId', lambda x: list(set(x))))
print (test)

           actual
userId
a          [3, 5]
b       [9, 2, 4]
Or use a list of tuples:
test = test.groupby('userId')['movieId'].agg([('actual', lambda x: list(set(x)))])
print (test)

           actual
userId
a          [3, 5]
b       [9, 2, 4]
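Applied to the line from your code that raised the error, the named-aggregation form is (a sketch; the lambda is unchanged):
test = test.copy().groupby('userId').agg(actual=('movieId', lambda x: list(set(x))))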

VIF function returns all 'inf' values

I'm handling a multicollinearity problem with the variance_inflation_factor() function.
But after running the function, I found that it returned all the scores as infinite values.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')
mols = []
mols.extend(mols_A1)
mols.extend(mols_A2)
mols.extend(mols_B)
mols.extend(mols_C)
mols_df = pd.DataFrame(mols)
mols = pd.concat([mols_df, class_tot, data], axis=1)
mols = mols.dropna(axis=0, thresh=1400)
mols.groupby('target_name_quarter').mean()
fill_mean_func = lambda g: g.fillna(g.mean())
mols = mols.groupby('target_name_quarter').apply(fill_mean_func)
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
descriptors_scaled = pd.DataFrame(descriptors_scaled, columns=descriptors.columns, index = list(descriptors.index.values))
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)
mols = pd.concat([molfiles, descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']], axis=1)
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])
vif = pd.DataFrame()
des = mols.loc[:, 'nAcid':'Zagreb']
vif["VIF factor"] = [variance_inflation_factor(des.values, i) for i in range(des.shape[1])]
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() when eliminating low-variance features so as to put all the variables in the same range.
print(vif) returns a dataframe with all infinite values and I cannot figure out why.
Thank you in advance :)
This indicates a perfect correlation between two independent variables. In the case of perfect correlation we get R² = 1, which makes the VIF, 1/(1 - R²), infinite. To solve this problem, we need to drop from the dataset one of the variables that is causing the perfect multicollinearity.
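A minimal sketch of one way to find and drop such columns before recomputing the VIFs; des is the descriptor DataFrame from your code, and the 0.999 cutoff is an arbitrary choice:

import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Upper triangle of the absolute correlation matrix
corr = des.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))

# Drop every column that is (near-)perfectly correlated with an earlier one
to_drop = [col for col in upper.columns if (upper[col] > 0.999).any()]
des_reduced = des.drop(columns=to_drop)

vif = pd.DataFrame()
vif["VIF factor"] = [variance_inflation_factor(des_reduced.values, i)
                     for i in range(des_reduced.shape[1])]
vif["features"] = des_reduced.columns
print(vif)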

How can I fix the 'syntax error' in Python using Jupyter Notebook?

Hi, I tried to fix the error but I could not, and I don't know where I'm going wrong. Can anyone please help? Below is my code.
My previous error was an indentation error.
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn as s
import matplotlib
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from IPython.display import display
df = pd.read_csv("C:/Users/patel/Desktop/tap.csv")
from IPython.display import display
X_all = df.drop(['FTR'],1)
y_all = df['FTR']
# Standardising the data.
from sklearn.preprocessing import scale
#Center to the mean and component wise scale to unit variance.
cols = [['FTHG','FTAG','HTHG','HTAG']]
for col in cols:
    X_all[col] = scale(X_all[col])
X_all.HM1 = X_all.HM1.astype('str')
X_all.HM2 = X_all.HM2.astype('str')
X_all.HM3 = X_all.HM3.astype('str')
X_all.AM1 = X_all.AM1.astype('str')
X_all.AM2 = X_all.AM2.astype('str')
X_all.AM3 = X_all.AM3.astype('str')
def preprocess_features(X):
    output = pd.DataFrame(index = X.index)
    for col, col_df in X.iteritems():
        if col_df.dtype == object:
            col_df = pd.get_dummies(col_df, prefix = col)
        output = output.join(col_df)
    return output
X_all = preprocess_features(X_all)
print "Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns))
print "\nFeature values:"
display (X_all)
File "", line 39
print "Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns))
^
SyntaxError: invalid syntax
If you are using Python 3, then the parentheses in the print function call are missing. The following code should work:
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

mean() got an unexpected keyword argument 'dtype'!

I am trying to implement image classification using Intel BigDL. It uses the MNIST dataset for classification. Since I don't want to use the MNIST dataset, I wrote an alternative approach as below:
imageUtils.py
from StringIO import StringIO
from PIL import Image
import numpy as np
from bigdl.util import common
from bigdl.dataset import mnist
from pyspark.mllib.stat import Statistics
def label_img(img):
    word_label = img.split('.')[-2].split('/')[-1]
    print word_label
    # conversion to one-hot array [cat,dog]
    # [much cat, no dog]
    if "jobs" in word_label: return [1,0]
    # [no cat, very doggo]
    elif "zuckerberg" in word_label: return [0,1]

# target is start from 0,
def get_data(sc, path):
    img_dir = path
    train = sc.binaryFiles(img_dir + "/train")
    test = sc.binaryFiles(img_dir + "/test")
    image_to_array = lambda rawdata: np.asarray(Image.open(StringIO(rawdata)))
    train_data = train.map(lambda x: (image_to_array(x[1]), np.array(label_img(x[0]))))
    test_data = test.map(lambda x: (image_to_array(x[1]), np.array(label_img(x[0]))))
    train_images = train_data.map(lambda x: x[0])
    test_images = test_data.map(lambda x: x[0])
    train_labels = train_data.map(lambda x: x[1])
    test_labels = test_data.map(lambda x: x[1])
    training_mean = np.mean(train_images)
    training_std = np.std(train_images)
    rdd_train_images = sc.parallelize(train_images)
    rdd_train_labels = sc.parallelize(train_labels)
    rdd_test_images = sc.parallelize(test_images)
    rdd_test_labels = sc.parallelize(test_labels)
    rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(lambda (features, label):
        common.Sample.from_ndarray(
            (features - training_mean) / training_std,
            label + 1))
    rdd_test_sample = rdd_test_images.zip(rdd_test_labels).map(lambda (features, label):
        common.Sample.from_ndarray(
            (features - training_mean) / training_std,
            label + 1))
    return (rdd_train_sample, rdd_test_sample)
Now when I try to get the data using real images, as below:
Classification.py
import pandas
import datetime as dt
from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.dataset import mnist
from imageUtils import get_data
from StringIO import StringIO
from PIL import Image
import numpy as np
init_engine()
path = "/home/fusemachine/Hyper/person"
(train_data, test_data) = get_data(sc,path)
print train_data.count()
print test_data.count()
I get the following error
TypeError                                 Traceback (most recent call last)
<ipython-input-...> in <module>()
      2 # Get and store MNIST into RDD of Sample, please edit the "mnist_path" accordingly.
      3 path = "/home/fusemachine/Hyper/person"
----> 4 (train_data, test_data) = get_data(sc,path)
      5 print train_data.count()
      6 print test_data.count()

/home/fusemachine/Downloads/dist-spark-2.1.0-scala-2.11.8-linux64-0.1.1-dist/imageUtils.py in get_data(sc, path)
     31     test_labels = test_data.map(lambda x : x[1])
---> 33     training_mean = np.mean(train_images)
     34     training_std = np.std(train_images)
     35     rdd_train_images = sc.parallelize(train_images)

/opt/anaconda3/lib/python2.7/site-packages/numpy/core/fromnumeric.pyc in mean(a, axis, dtype, out, keepdims)
   2884         pass
   2885     else:
-> 2886         return mean(axis=axis, dtype=dtype, out=out, **kwargs)
   2887
   2888     return _methods._mean(a, axis=axis, dtype=dtype,

TypeError: mean() got an unexpected keyword argument 'dtype'
I could not figure out the solution for this. Also, is there any alternative to the MNIST dataset, so that we can directly process real images?
Thank you
train_images is an RDD, and you can't apply NumPy's mean to an RDD. One way is to do a collect() and apply NumPy's mean to the result:
train_images = train_data.map(lambda x : x[0]).collect()
training_mean = np.mean(train_images)
Or use rdd.mean():
training_mean = train_images.mean()
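A minimal sketch of the first option in the context of get_data, assuming all images have the same shape so they can be stacked into a single array:
import numpy as np

# Bring the image arrays back to the driver and stack them into one ndarray
train_images_local = np.stack(train_data.map(lambda x: x[0]).collect())
training_mean = np.mean(train_images_local)
training_std = np.std(train_images_local)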
