I am writing this code, but the following error appears:
This is my code, based on https://github.com/statisticianinstilettos/recmetrics/blob/master/example.ipynb:
!pip install scipy
!pip install git+https://github.com/statisticianinstilettos/recmetrics
import pandas as pd
import numpy as np
import recmetrics
import matplotlib.pyplot as plt
from surprise import Reader, SVD, Dataset
from surprise.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
from zipfile import ZipFile
# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# Use the ratings.csv file
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"
# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")
ratings_file = movielens_dir / "ratings.csv"
ratings = pd.read_csv(ratings_file)
ratings = ratings.query('rating >=3')
ratings.reset_index(drop=True, inplace=True)
#only consider ratings from users who have rated over n movies
n=1000
users = ratings.userId.value_counts()
users = users[users>n].index.tolist()
ratings = ratings.query('userId in @users')
print(ratings.shape)
ratings.head(3)
# get movie features
rated_movies = ratings.movieId.tolist()
movies_file = movielens_dir / "movies.csv"
movies = pd.read_csv(movies_file)
movies = movies.query('movieId in @rated_movies')
movies.set_index("movieId", inplace=True, drop=True)
movies = movies.genres.str.split("|", expand=True)
movies.reset_index(inplace=True)
movies = pd.melt(movies, id_vars='movieId', value_vars=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
movies.drop_duplicates("movieId", inplace=True)
movies.set_index('movieId', inplace=True)
movies = pd.get_dummies(movies.value)
#movies = movies[['Action', 'Romance', 'Western', 'Comedy', 'Crime']]
movies.head()
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(15, 7))
recmetrics.long_tail_plot(df=ratings,
                          item_id_column="movieId",
                          interaction_type="movie ratings",
                          percentage=0.5,
                          x_labels=False)
#format data for surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.25)
#train SVD recommender
algo = SVD()
algo.fit(trainset)
#make predictions on test set.
test = algo.test(testset)
test = pd.DataFrame(test)
test.drop("details", inplace=True, axis=1)
test.columns = ['userId', 'movieId', 'actual', 'cf_predictions']
test.head()
#evaluate model with MSE and RMSE
print(recmetrics.mse(test.actual, test.cf_predictions))
print(recmetrics.rmse(test.actual, test.cf_predictions))
def get_users_predictions(user_id, n, model):
    recommended_items = pd.DataFrame(model.loc[user_id])
    recommended_items.columns = ["predicted_rating"]
    recommended_items = recommended_items.sort_values('predicted_rating', ascending=False)
    recommended_items = recommended_items.head(n)
    return recommended_items.index.tolist()
#get example prediction
get_users_predictions(274, 10, cf_model)
# Error
test = test.copy().groupby('userId')['movieId'].agg({'actual': (lambda x: list(set(x)))})
Error:
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
<ipython-input-36-91830a4ef799> in <module>()
1 #format test data
----> 2 test = test.copy().groupby('userId')['movieId'].agg({'actual': (lambda x: list(set(x)))})
3
4
5
1 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg)
292 # GH 15931
293 if isinstance(self._selected_obj, Series):
--> 294 raise SpecificationError("nested renamer is not supported")
295
296 columns = list(arg.keys())
SpecificationError: nested renamer is not supported
I read that the {} dict-of-renamers syntax is no longer supported and that the aggregation syntax has changed, but unfortunately I can't find a solution that I can apply to my problem. How do I solve this?
Use named aggregation:
test = pd.DataFrame({
    'movieId':[5,3,3,9,2,4,9],
    'userId':list('aaabbbb')
})
test = test.groupby('userId').agg(actual = ('movieId', lambda x: list(set(x))))
print (test)
           actual
userId
a          [3, 5]
b       [9, 2, 4]
Or use a list of tuples:
test = test.groupby('userId')['movieId'].agg([('actual', lambda x: list(set(x)))])
print (test)
           actual
userId
a          [3, 5]
b       [9, 2, 4]
Using the iris dataset as a hypothetical hello world example:
import pandas as pd
from sklearn import datasets
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0,1,2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "")
print(df.head())
Let us say I want to use tf.keras.layers.Embedding instead of one-hot/dummy encoding as part of an ANN for regression, e.g.:
iris_class_name + sepalwidthcm + petallengthcm -> sepallengthcm
where sepallengthcm is the dependent variable. I came across this:
city_lookup = tf.keras.layers.StringLookup(vocabulary = city_vocabulary, mask_token = None)
city_embedding = tf.keras.Sequential([
    city_lookup,
    tf.keras.layers.Embedding(len(city_vocabulary) + 1, embedding_dimension)
], "city_embedding")
city = features["city"]
city_embedding_output = city_embedding(city)
but I am not sure exactly how to use it in my use case. Any pointers are very much welcome. Thanks!
You can map iris_class_name to n-dimensional vector representations and then concatenate with the other continuous features:
import pandas as pd
from sklearn import datasets
import numpy as np
import tensorflow as tf
iris = datasets.load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['iris_class'] = pd.Series(iris['target'], name = 'target_values')
df['iris_class_name'] = df['iris_class'].replace([0,1,2], ['iris-' + species for species in iris['target_names'].tolist()])
df.columns = df.columns.str.replace("[() ]", "")
vocab = df['iris_class_name'].unique()
embedding_dimension = 10
lookup = tf.keras.layers.StringLookup(vocabulary = vocab, mask_token = None)
embedding = tf.keras.Sequential([
    lookup,
    tf.keras.layers.Embedding(len(vocab) + 1, embedding_dimension)
])
names = df['iris_class_name'].to_numpy()
embedding_output = embedding(names)
features = np.concatenate((embedding_output, df[['sepalwidthcm', 'petallengthcm']].to_numpy()), axis=-1)
print(features.shape)
(150, 12)
Since you have 3 unique iris class names, you could also simply create an integer-to-vector dictionary manually, but it is up to you.
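If you would rather train an end-to-end regression model than pre-compute the concatenated features, a minimal sketch could look like the following. This is not from the original answer: the layer sizes, optimizer and epoch count are arbitrary choices, and it reuses the embedding, names and df objects defined above to predict sepallengthcm from the class name plus the two continuous columns.
import tensorflow as tf

# string input for the class name and a 2-d input for the continuous columns
name_input = tf.keras.Input(shape=(), dtype=tf.string, name="iris_class_name")
cont_input = tf.keras.Input(shape=(2,), name="continuous")  # sepalwidthcm, petallengthcm

x = embedding(name_input)                        # lookup + Embedding stack from above
x = tf.keras.layers.Concatenate()([x, cont_input])
x = tf.keras.layers.Dense(16, activation="relu")(x)
output = tf.keras.layers.Dense(1)(x)             # regression head for sepallengthcm

model = tf.keras.Model([name_input, cont_input], output)
model.compile(optimizer="adam", loss="mse")
model.fit(
    [names, df[['sepalwidthcm', 'petallengthcm']].to_numpy()],
    df['sepallengthcm'].to_numpy(),
    epochs=5,
)
The embedding weights are then learned jointly with the regression head instead of being fixed up front.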
So I am pretty new to Python, and I am trying to load a dataset from my computer using scikit-learn. This is what my code looks like:
**whatever.py**
import numpy as np
import csv
from sklearn.datasets.base import Bunch
class Cortex_nuc:
    def cortex_nuclear():
        with open('C:/Users/User/Desktop/Data_Cortex_Nuclear4.csv') as csv_file:
            data_file = csv.reader(csv_file)
            temp = next(data_file)
            n_samples = int(float(temp[0]))
            n_features = int(float(temp[1]))
            data = np.empty((n_samples, n_features))
            target = np.empty((n_samples,), dtype=np.float64)

            for i, sample in enumerate(data_file):
                data[i] = np.asarray(sample[:-1], dtype=np.float64)
                target[i] = np.asarray(sample[-1], dtype=np.float64)

        return Bunch(data=data, target=target)
So then I import it into my project:
from whatever import Cortex_nuc
and after that I try to save it into df:
df = Cortex_nuc.cortex_nuclear()
Btw, this is what the dataset looks like (screenshot omitted): it has 77 columns and about a thousand rows; the screenshot showed just a part of it.
But I get an error message and I can't seem to figure out why it's happening. Here's the error message:
IndexError Traceback (most recent call last)
<ipython-input-5-a4935f2c187f> in <module>
----> 1 df = Cortex_nuc.cortex_nuclear()
~\whatever.py in cortex_nuclear()
20
21 for i, sample in enumerate(data_file):
---> 22 data[i] = np.asarray(sample[:-1], dtype=np.float64)
23 target[i] = np.asarray(sample[-1], dtype=np.float64)
24
IndexError: index 0 is out of bounds for axis 0 with size 0
Can someone please help me? Thanks!
If you want to create a "sklearn-like" dataset in a Bunch object, you probably want something like this:
import pandas as pd
import numpy as np
from sklearn.utils import Bunch
# For reproducing
from io import StringIO
csv_file = StringIO("""
target,A,B
0,0,0
1,0,1
1,1,0
0,1,1
""")
def load_xor(*, return_X_y=False):
    """Describe your data here."""
    _data_file = pd.read_csv(csv_file)
    _data = Bunch()
    _data["DESCR"] = load_xor.__doc__
    _data["data"] = _data_file[["A", "B"]].to_numpy(dtype=np.float64)
    _data["target"] = _data_file["target"].to_numpy(dtype=np.float64)
    _data["target_names"] = np.array(["false", "true"])
    _data["feature_names"] = np.array(list(_data_file.drop(["target"], axis=1)))
    if return_X_y:
        return _data.data, _data.target
    return _data

if __name__ == "__main__":
    # Return and unpack the `X`, `y` tuple
    X, y = load_xor(return_X_y=True)
    print(X, y)
This is because sklearn.datasets typically return Bunch objects with specific attributes/keys (for explanations, see the "Return" section of the load_iris documentation):
>>> from sklearn.datasets import load_iris
>>> data = load_iris()
>>> dir(data)
['DESCR', 'data', 'feature_names', 'filename', 'frame', 'target', 'target_names']
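To adapt the same pattern to your own file, here is a hedged sketch (untested against your data): it assumes Data_Cortex_Nuclear4.csv has a header row and that the last column holds a numeric class label, which is what your sample[-1] loop implied. Reading the CSV with pandas avoids relying on the first row to contain sample and feature counts.
import pandas as pd
import numpy as np
from sklearn.utils import Bunch

def cortex_nuclear(path='C:/Users/User/Desktop/Data_Cortex_Nuclear4.csv', *, return_X_y=False):
    """Placeholder description of the cortex nuclear dataset."""
    frame = pd.read_csv(path)
    data = Bunch()
    data["DESCR"] = cortex_nuclear.__doc__
    # all columns except the last are treated as features (an assumption)
    data["data"] = frame.iloc[:, :-1].to_numpy(dtype=np.float64)
    # the last column is treated as the numeric target (an assumption)
    data["target"] = frame.iloc[:, -1].to_numpy(dtype=np.float64)
    data["feature_names"] = np.array(frame.columns[:-1])
    if return_X_y:
        return data.data, data.target
    return data
If some of your columns are non-numeric (for example an ID column), you would need to drop or encode them before the to_numpy(dtype=np.float64) conversion.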
I am using ARIMA to do forecasting in Python; the following is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
HSBC = pd.read_csv('HSBC.csv', index_col = 'Date', parse_dates = True)
HSBC2 = HSBC['Close']
result = seasonal_decompose(HSBC2, model='multiplicative', period = 1)
from pmdarima import auto_arima
import warnings
warnings.filterwarnings("ignore")
stepwise_fit = auto_arima(HSBC2, start_p = 1, start_q = 1,
                          max_p = 3, max_q = 3, m = 12,
                          start_P = 0, seasonal = True,
                          d = None, D = 1, trace = True,
                          error_action ='ignore',
                          suppress_warnings = True,
                          stepwise = True)
train = HSBC2[0:173]
test = HSBC2[173:248]
from statsmodels.tsa.statespace.sarimax import SARIMAX
model = SARIMAX(train, order = (0, 1, 1), seasonal_order =(0,1,1,12))
result = model.fit()
start = len(train)
end = len(train) + len(test) - 1
prediction = result.predict(start, end,
                            typ = 'levels').rename("Predictions")
prediction.plot(legend = True)
test.plot(legend = True)
I am confused about why the x-axis of the prediction plot shows numbers, when it is supposed to show dates like the test plot does.
If I am not wrong, this is because you have not specified the frequency of your index. Try this:
HSBC.index = pd.date_range(freq='d', start=HSBC.index[0], periods=len(HSBC))
Beware that you should only use freq='d' if your index is daily spaced.
EDIT:
So, the answer was just changing the start and end parameters of the predict method, e.g.:
start = test['Date'].iloc[0]
end = test['Date'].iloc[-1]
prediction = result.predict(start, end,
                            typ = 'levels').rename("Predictions")
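Putting the two pieces together against the variables in the question, a minimal sketch would be the following. It assumes HSBC2 now has a DatetimeIndex with an explicit frequency, and it takes the dates from the index of test, since test in the question is a slice of HSBC2 and has no 'Date' column.
start = test.index[0]
end = test.index[-1]

# with date labels for start/end, the predictions are indexed by dates as well
prediction = result.predict(start, end, typ='levels').rename("Predictions")
prediction.plot(legend=True)
test.plot(legend=True)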
It seems that my code fails when I try to select which headers/columns of the data I want to use, giving me an IndexError when trying to parse the headers.
import pandas as pd
import quandl
import math, datetime
import numpy as np
from sklearn import preprocessing , cross_validation, svm
from sklearn.linear_model import LinearRegression
import scipy
import matplotlib.pyplot as plt
from matplotlib import style
import pickle
style.use('ggplot')
df = pd.read_csv('convertcsv.csv',sep='\t')
df = np.array(df)
print(df)
df = df[['Open','High','Low','Close','Volume (BTC)']]
print("ok")
df['HL_PCT'] = (df['High'] - df['Close']) / df['Close'] * 100.0
df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
df = df[['Close','HL_PCT','PCT_change','Volume (BTC)']]
forecast_col = 'Close'
df.fillna(-999999, inplace=True)
forecast_out = int(math.ceil(0.01*len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)
X = np.array(df.drop(['label'],1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out:]
df.dropna(inplace=True)
y = np.array(df['label'])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y,
                                                                     test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
with open('linearregression.pickle','wb') as f:
    pickle.dump(clf, f)
pickle_in = open('linearregression.pickle','rb')
clf =pickle.load(pickle_in)
accuracy = clf.score(X_test,y_test)
print(accuracy)
forecast_set = clf.predict(X_lately)
df['Forecast'] = np.nan
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]
df['Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.pause(1)
plt.show()
print("we done?")
...
I can't seem to figure out what I am doing wrong; it worked with the previous dataset I was using. If it helps, here is the format of the CSV file that I was pulling from:
Timestamp,Open,High,Low,Close,Volume (BTC),Volume (Currency),Weighted Price
2017-09-30 00:00:00,4162.04,4177.63,4154.28,4176.08,114.81,478389.12,4166.96
2017-09-30 01:00:00,4170.84,4224.6,4170.84,4208.14,348.45,1463989.18,4201.4
I am not too experienced with this sort of thing, and I tried to find other people with the same error, but everyone was having a different sort of problem. I can include more data if it is needed.
You're converting your dataframe to a numpy array with df = np.array(df).
Don't expect a numpy array to function as a pandas dataframe.
Remove
df = np.array(df)
and you should be able to select columns from your DataFrame by name with
df = df[['Open','High','Low','Close','Volume (BTC)']]
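As a tiny illustration of the difference (the values below are just for demonstration), label-based selection only works on the DataFrame, not on the array it was converted into:
import pandas as pd
import numpy as np

df = pd.DataFrame({'Open': [1.0], 'High': [2.0], 'Low': [0.5],
                   'Close': [1.5], 'Volume (BTC)': [10.0]})

print(df[['Open', 'High', 'Low', 'Close', 'Volume (BTC)']])  # works: labels exist on a DataFrame

arr = np.array(df)
# arr[['Open', 'High']]  # would raise an IndexError: a plain ndarray has no column labels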
This error eludes me, because after running type(PCAdata) it returns <class 'numpy.ndarray'>. After reading about similar "module object is not callable" errors, it seems they are caused by not importing the object itself from within the module, such as "from PCA import PCA". However, I'm already importing PCA from sklearn.decomposition.
Here is my data: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
#Load CSV
filename = 'data.csv'
data = pd.read_csv(filename)
df = pd.DataFrame(data)
df=df.dropna(axis=1,how='all')
array = df.values
X = array[:,2:32]
Y = array[:, 1]
#Normalize Data
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result
df_normalized = normalize(df[ df.columns[2:32]])
pca = PCA(n_components = 16)
pca.fit_transform(df_normalized)
PCAdf = pd.DataFrame(pca.components_, columns = df_normalized.columns, index = ['PC-1','PC-2','PC-3','PC-4','PC-5','PC-6','PC-7','PC-8','PC-9','PC-10','PC-11','PC-12','PC-13','PC-14','PC-15','PC-16'])
PCAarray = PCAdf.values
#Convert all of the "M" class labels as 1, and "B" Labels as 0
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
df_v_y_encoded = encoder.transform(df_v_y)
#Train again, this time using features from principal component analysis.
classifierPCAfeatures = svm.SVC(gamma = "auto", C = 1, kernel = "rbf", decision_function_shape='ovo')
classifierPCAfeatures = fit(PCAdf, encoded_Y)
print(classifierPCAfeatures.score(df_v_x, df_v_y_encoded))