Modifying a numpy array after conversion from pandas dataframe - python

I am writing the following code as part of a simple movie recommender in Python, to reproduce the results from Coursera's Machine Learning course taught by Andrew Ng.
I want to modify the numpy.ndarray that I get after calling as_matrix() on the pandas dataframe, adding a column vector to it the way one can in MATLAB:
Y = [ratings Y]
Here is my Python code:
import numpy as np
import pandas as pd

dataFile = '/filepath/'
userItemRatings = pd.read_csv(dataFile, sep="\t", names=['userId', 'movieId', 'rating','timestamp'])
movieInfoFile = '/filepath/'
movieInfo = pd.read_csv(movieInfoFile, sep="|", names=['movieId','Title','Release Date','Video Release Date','IMDb URL','Unknown','Action','Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western'], encoding = "ISO-8859-1")
userMovieMatrix=pd.merge(userItemRatings, movieInfo, left_on='movieId', right_on='movieId')
userMovieSubMatrix = userMovieMatrix[['userId', 'movieId', 'rating','timestamp','Title']]
Y = pd.pivot_table(userMovieSubMatrix, values='rating', index=['movieId'], columns=['userId'])
Y.fillna(0,inplace=True)
movies = Y.shape[0]
users = Y.shape[1] +1
ratings = np.zeros((1682, 1))
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5
features = 10
theta = pd.DataFrame(np.random.rand(users, features))  # users: 944 x 10
X = pd.DataFrame(np.random.rand(movies, features))  # movies: 1682 x 10
X = X.as_matrix()
theta = theta.as_matrix()
Y = Y.as_matrix()
"""want to insert a column vector into this Y to get a new Y of dimension
1682*944, but only seeing 1682*943 after the following statement
"""
np.insert(Y, 0, ratings, axis=1)
R = Y.copy()
R[R!=0] = 1
Ymean = np.zeros((movies, 1))
Ynorm = np.zeros((movies, users))
for i in range(movies):
    idx = np.where(R[i,:] == 1)[0]
    Ymean[i] = Y[i,idx].mean()
    Ynorm[i,idx] = Y[i,idx] - Ymean[i]
print(type(Ymean), type(Ynorm), type(Y), Y.shape)
Ynorm[np.isnan(Ynorm)] = 0.
Ymean[np.isnan(Ymean)] = 0.
The inline comment marks the problem: when I create a fresh numpy array and call insert on it, it works just fine. However, on the numpy array I get by calling as_matrix() on the dataframe produced by pivot_table(), the insert appears to have no effect. Is there an alternative?

np.insert does not operate in place; you need to assign its output to a variable. Try:
Y = np.insert(Y, 0, ratings, axis=1)
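As a minimal illustration (the array names here are made up, not from the question):

import numpy as np

a = np.zeros((3, 2))
col = np.ones(3)
b = np.insert(a, 0, col, axis=1)  # returns a new array; a itself is unchanged
print(a.shape, b.shape)  # (3, 2) (3, 3)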

Related

Python's `.loc` is really slow on selecting subsets of Data

I have a large DataFrame df with a (y, t) MultiIndex and a single value column. Currently, I select a subset via df.loc[(Y,T), :] and create a dictionary out of it. The following MWE works, but the selection is very slow for large subsets.
import numpy as np
import pandas as pd
# Full DataFrame
y_max = 50
Y_max = range(1, y_max+1)
t_max = 100
T_max = range(1, t_max+1)
idx_max = tuple((y,t) for y in Y_max for t in T_max)
df = pd.DataFrame(np.random.sample(y_max*t_max), index=idx_max, columns=['Value'])
# Create Dictionary of Subset of Data
y1 = 4
yN = 10
Y = range(y1, yN+1)
t1 = 5
tN = 9
T = range(t1, tN+1)
idx_sub = tuple((y,t) for y in Y for t in T)
data_sub = df.loc[(Y,T), :] #This is really slow
dict_sub = dict(zip(idx_sub, data_sub['Value']))
# result, e.g. (y,t) = (5,7)
dict_sub[5,7] == df.loc[(5,7), 'Value']
I was thinking of slicing with df.loc[(y1,t1):(yN,tN), :], but it does not work properly: the second index level is only bounded in the final year yN, not within every year.
One idea is to use Index.isin with itertools.product for boolean indexing:
from itertools import product
idx_sub = tuple(product(Y, T))
dict_sub = df.loc[df.index.isin(idx_sub),'Value'].to_dict()
print (dict_sub)
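If the index is a proper MultiIndex, another option (a sketch of my own, not part of the original answer) is to slice both levels with pd.IndexSlice after sorting the index, which avoids materializing the product at all:

idx = pd.IndexSlice
df_sorted = df.sort_index()  # slicing requires a lexsorted MultiIndex
dict_sub = df_sorted.loc[idx[y1:yN, t1:tN], 'Value'].to_dict()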

Using Scipy.optimize.curve_fit to fit an exponential

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

def Data_to_array(file):
    r = int
    x, y = [], []
    data = []
    line_num = 0
    # call data
    P = open(file, 'r')
    data = P.readlines()
    # Get it to ignore strings
    for line in data:
        line_num += 1
        if line.find("[data]") >= 0:
            r = (line_num + 1)
            # Data = P.readlines()[:r]
            # print (Data)
        if "Sampling Rate" in line:
            SR = float(line[15:])
        if "temperature=" in line:
            T = float(line[12:18])
            print(str("Temperature = "))
            print(T)
    Data = data[r:line_num]
    # assign data into dataframe
    df = pd.DataFrame(Data)
    # rename column in data
    df = df.rename(columns={0: 'volts'})
    # get it to recognise the index
    df.index.name = 'Index'
    # get it to recognise the data as numbers
    df = df.astype({'volts': float})
    # get index to start at 1
    df.index += 1
    # assign data to lists
    I = df.index.to_list()
    t = df['volts'].to_list()
    # invert the data
    y = [element * -1 for element in t]
    # divide by the sampling rate
    x = [element / SR for element in I]
    return x, y
# This is to create the exponential function
def Exponential_func(file):
    temp_array = Data_to_array(file)
    X = np.asarray(temp_array[0])
    a, b = float()
    # Y = temp_array[1]
    f = np.exp(a*X) + b
    return f

# This is to get the optimize function to work
def Exponential_model(file):
    temp_array = Data_to_array(file)
    X = np.asarray(temp_array[0])
    Y = np.asarray(temp_array[1])
    # f = np.exp(X)
    # exp_mod = lf.ExponentialModel(X, Y)
    # pars = exp_mod.guess(Y, X)
    r = sp.optimize.curve_fit(X, Y, Exponential_func.f)
    return r
# This is to plot the data
def Plot_Data(file):
    temp_array = Data_to_array(file)
    X = np.asarray(temp_array[0])
    Y = np.asarray(temp_array[1])
    # p_0 = np.exp(X)
    # sp.optimize.curve_fit(X, Y, p_0)
    plt.scatter(X, Y)
    # plt.plot(Exponential_model.r)
    plt.show()
    plt.xlabel("Time (s)")
    plt.ylabel("Capacitance (μF)")

# print(Data_to_array('Cz-Si-T-1.txt')[1])
Plot_Data("Cz-Si-T-82.txt")
Exponential_func("Cz-Si-T-82.txt")
Exponential_model("Cz-Si-T-82.txt")
When I try to use the sp.optimize function, I get the error "'function' object has no attribute 'f'", even though, from what I can find about this problem, I have the function and variables in the correct order.
I need this piece of code to fit an exponential curve to my data, which does follow an exponential trend. Can anyone help? It would also be helpful to print the function of the fitted curve, as I will be integrating under it later.
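For reference, scipy.optimize.curve_fit expects the model function as its first argument, followed by the x data and the y data, and the model must take x as its first parameter with the free coefficients after it. A minimal sketch of the intended usage (the model form, the synthetic data, and the starting values are illustrative assumptions, not from the question):

import numpy as np
from scipy.optimize import curve_fit

def exponential(x, a, b, c):
    # model assumed for illustration: a * exp(b * x) + c
    return a * np.exp(b * x) + c

# x_data, y_data would come from Data_to_array(file) in the question
x_data = np.linspace(0, 4, 50)
y_data = 2.5 * np.exp(-1.3 * x_data) + 0.5

popt, pcov = curve_fit(exponential, x_data, y_data, p0=(1.0, -1.0, 0.0))
a, b, c = popt
print(f"fitted curve: {a:.3f} * exp({b:.3f} * x) + {c:.3f}")

This also prints the function of the fitted curve, which the question asks for.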

How to speed up data readings from dataframe columns in python?

Good day,
My data frame looks like this (see the attached picture). I'm trying to read the values of every row from 4 different columns of the data frame and store them in a single NumPy array. Each column has 150,000 rows, so the resulting NumPy array ends up with 600,000 rows, and I have to do this 4 times, which means creating 4 arrays of 600,000 values each. I used a basic for-loop in my Python code, but it took almost 5 minutes to compute.
Does anyone know a better way to do this in order to improve its performance?
Thank you,
Here is my Python Code:
def oversampling(self):
    # Oversampling restructuring
    sh = self.df[['nSensor01_00']].values.shape
    nSensor01 = np.zeros(shape=(sh[0] * 4, 1))
    nSensor02 = np.zeros(shape=(sh[0] * 4, 1))
    nSensor03 = np.zeros(shape=(sh[0] * 4, 1))
    nSensor04 = np.zeros(shape=(sh[0] * 4, 1))
    temp = np.arange(4, sh[0] * 4, 4)
    ttime = np.arange(0, sh[0] / 500, 0.0005)
    names = ['nSensor01', 'nSensor02', 'nSensor03', 'nSensor04']
    for i in temp:
        ind_begin = i - 4
        ind_end = ind_begin + 4
        a = int((i - 1) / 4)
        nSensor01[ind_begin:ind_end] = self.df.iloc[a, 55:59].values.flatten().reshape((4, 1))
        nSensor02[ind_begin:ind_end] = self.df.iloc[a, 59:63].values.flatten().reshape((4, 1))
        nSensor03[ind_begin:ind_end] = self.df.iloc[a, 63:67].values.flatten().reshape((4, 1))
        nSensor04[ind_begin:ind_end] = self.df.iloc[a, 67:71].values.flatten().reshape((4, 1))
    d = np.hstack((nSensor01, nSensor02, nSensor03, nSensor04))
    self.dfkHz = pd.DataFrame(data=d, columns=names)
    self.dfkHz.insert(0, 'Time', ttime)
Does this work for you?
sh = self.df[['nSensor01_00']].values.shape
df_kHz = pd.DataFrame()
df_kHz["time"] = (np.arange(0, sh[0] / 500, 0.0005))
df_kHz["nSensor01"] = self.df.iloc[:, 55:59].values.flatten()
df_kHz["nSensor02"] = self.df.iloc[:, 59:63].values.flatten()
df_kHz["nSensor03"] = self.df.iloc[:, 63:67].values.flatten()
df_kHz["nSensor04"] = self.df.iloc[:, 67:71].values.flatten()

Down sample DF1 according to the coordinates in DF2

I have two DataFrames. Both have X and Y coordinates, but DF1 is much denser than DF2. I want to downsample DF1 according to the X/Y coordinates in DF2. Specifically, for each X/Y pair in DF2, I select the DF1 data between X +/- delta and Y +/- delta, and calculate the average value of Z. New_DF1 will have the same X/Y coordinates as DF2, but with the average Z values from the downsampling.
Here are some data examples and a function I made for this purpose. My problem is that it is too slow for a large dataset. It would be highly appreciated if anyone has a better idea for vectorizing the operation instead of crude looping.
Create data examples:
DF1 = pd.DataFrame({'X':[0.6,0.7,0.9,1.1,1.3,1.8,2.1,2.8,2.9,3.0,3.3,3.5],"Y":[0.6,0.7,0.9,1.1,1.3,1.8,2.1,2.8,2.9,3.0,3.3,3.5],'Z':[1,2,3,4,5,6,7,8,9,10,11,12]})
DF2 = pd.DataFrame({'X':[1,2,3],'Y':[1,2,3],'Z':[10,20,30]})
Function:
def DF1_match_DF2_target(half_range, DF2, DF1):
    ### half_range: scalar, defines the area of the dbf target
    ### DF2: dbf data
    ### DF1: raw pwg pixel map
    DF2_X = DF2.loc[:, ["X"]]
    DF2_Y = DF2.loc[:, ['Y']]
    results = list()
    for i in DF2.index:
        # Select target XY from DF2
        x = DF2_X.at[i, 'X']
        y = DF2_Y.at[i, 'Y']
        # Select X,Y range for DF1
        upper_lmt_X = x + half_range
        lower_lmt_X = x - half_range
        upper_lmt_Y = y + half_range
        lower_lmt_Y = y - half_range
        # Select data from DF1 according to X,Y range, calculate average Z
        subset_X = DF1.loc[(DF1['X'] > lower_lmt_X) & (DF1['X'] < upper_lmt_X)]
        subset_XY = subset_X.loc[(subset_X['Y'] > lower_lmt_Y) & (subset_X['Y'] < upper_lmt_Y)]
        result = subset_XY.mean(axis=0, skipna=True)
        result[0] = x  # set X,Y in new_DF1 the same as the X,Y in DF2
        result[1] = y  # set X,Y in new_DF1 the same as the X,Y in DF2
        results.append(result)
    results = pd.DataFrame(results)
    return results
Test and Result:
new_DF1 = DF1_match_DF2_target(0.5,DF2,DF1)
new_DF1
How about using the pandas.cut() function to aggregate using the boundary values?
half_range = 0.5
# create bins
x_bins = [0] + list(df2.x)
y_bins = [0] + list(df2.y)
tmp = [half_range]*(len(df2)+1)
x_bins = [a + b for a, b in zip(x_bins, tmp)]
y_bins = [a + b for a, b in zip(y_bins, tmp)]
key = pd.cut(df1.x, bins=x_bins, right=False, precision=1)
df3 = df1.groupby(key).mean().reset_index()
df2.z = df3.z
df2
x y z
0 1 1 3.0
1 2 2 6.5
2 3 3 9.5
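If both frames fit in memory, a fully vectorized alternative (my own sketch, not part of the original answer) is to broadcast DF2's coordinates against DF1's and average Z per window. It builds a len(DF2) x len(DF1) boolean mask, so it trades memory for speed:

import numpy as np

half_range = 0.5
x1 = DF1['X'].to_numpy()
y1 = DF1['Y'].to_numpy()
z1 = DF1['Z'].to_numpy()
x2 = DF2['X'].to_numpy()[:, None]  # column vectors so the comparisons broadcast
y2 = DF2['Y'].to_numpy()[:, None]
# one row per DF2 point: True where a DF1 point falls in its +/- half_range window
mask = (np.abs(x1 - x2) < half_range) & (np.abs(y1 - y2) < half_range)
counts = mask.sum(axis=1)
z_mean = np.where(counts > 0, mask @ z1 / np.maximum(counts, 1), np.nan)
new_DF1 = DF2[['X', 'Y']].assign(Z=z_mean)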

Statsmodels OLS with rolling window problem

I would like to do a regression with a rolling window, but I get only one parameter back after the regression:
rolling_beta = sm.OLS(X2, X1, window_type='rolling', window=30).fit()
rolling_beta.params
The result:
X1 5.715089
dtype: float64
What could be the problem?
Thanks in advance, Roland
I think the problem is that the parameters window_type='rolling' and window=30 simply do not do anything. First I'll show you why, and at the end I'll provide a setup I've got lying around for linear regressions on rolling windows.
1. The problem with your function:
Since you haven't provided any sample data, here's a function that returns a dataframe of a desired size filled with random numbers:
# Function to build synthetic data
import numpy as np
import pandas as pd
import statsmodels.api as sm
from collections import OrderedDict

def sample(rSeed, periodLength, colNames):
    np.random.seed(rSeed)
    date = pd.to_datetime("1st of Dec, 1999")
    cols = OrderedDict()
    for col in colNames:
        cols[col] = np.random.normal(loc=0.0, scale=1.0, size=periodLength)
    dates = date + pd.to_timedelta(np.arange(periodLength), 'D')
    df = pd.DataFrame(cols, index=dates)
    return df
Output (here for a 50-period dataframe with columns X1 and X2):
X1 X2
2018-12-01 -1.085631 -1.294085
2018-12-02 0.997345 -1.038788
2018-12-03 0.282978 1.743712
2018-12-04 -1.506295 -0.798063
2018-12-05 -0.578600 0.029683
.
.
.
2019-01-17 0.412912 -1.363472
2019-01-18 0.978736 0.379401
2019-01-19 2.238143 -0.379176
Now, try:
rolling_beta = sm.OLS(df['X2'], df['X1'], window_type='rolling', window=30).fit()
rolling_beta.params
Output:
X1 -0.075784
dtype: float64
And this at least matches the structure of your output: you expect an estimate for each of your sample windows, but instead you get a single estimate. I looked for other examples using the same arguments, both online and in the statsmodels docs, but could not find any that actually worked; what I did find were a few discussions saying this functionality was deprecated a while ago. So I tested the same call with some bogus input for those parameters:
rolling_beta = sm.OLS(df['X2'], df['X1'], window_type='amazing', window=3000000).fit()
rolling_beta.params
Output:
X1 -0.075784
dtype: float64
And as you can see, the estimates are the same, and no error messages are returned for the bogus input. So I suggest that you take a look at the function below. This is something I've put together to perform rolling regression estimates.
2. A function for regressions on rolling windows of a pandas dataframe
df = sample(rSeed = 123, colNames = ['X1', 'X2', 'X3'], periodLength = 50)
def RegressionRoll(df, subset, dependent, independent, const, win, parameters):
    """
    RegressionRoll takes a dataframe, makes a subset of the data if you like,
    runs a series of regressions with a specified window length, and returns
    a dataframe with BETA or R^2 for each window split of the data.

    Parameters:
    ===========
    df: pandas dataframe
    subset: integer - has to be smaller than the size of the df
    dependent: string that specifies name of dependent variable
    independent: LIST of strings that specifies names of independent variables
    const: boolean - whether or not to include a constant term
    win: integer - window length of each model
    parameters: string that specifies which model parameters to return:
        BETA or R^2

    Example:
    ========
    RegressionRoll(df=df, subset=50, dependent='X1', independent=['X2'],
                   const=True, parameters='beta', win=30)
    """
    # Data subset
    if subset != 0:
        df = df.tail(subset)

    # Loop info
    end = df.shape[0]
    rng = np.arange(start=win, stop=end, step=1)

    # Subset and store dataframes
    frames = {}
    n = 1
    for i in rng:
        df_temp = df.iloc[:i].tail(win)
        newname = 'df' + str(n)
        frames.update({newname: df_temp})
        n += 1

    # Analysis on subsets
    df_results = pd.DataFrame()
    for frame in frames:
        # print(frames[frame])
        # Rolling data frames
        dfr = frames[frame]
        y = dependent
        x = independent
        if const == True:
            x = sm.add_constant(dfr[x])
            model = sm.OLS(dfr[y], x).fit()
        else:
            model = sm.OLS(dfr[y], dfr[x]).fit()
        if parameters == 'beta':
            theParams = model.params[0:]
            coefs = theParams.to_frame()
            df_temp = pd.DataFrame(coefs.T)
            indx = dfr.tail(1).index[-1]
            df_temp['Date'] = indx
            df_temp = df_temp.set_index(['Date'])
        if parameters == 'R2':
            theParams = model.rsquared
            df_temp = pd.DataFrame([theParams])
            indx = dfr.tail(1).index[-1]
            df_temp['Date'] = indx
            df_temp = df_temp.set_index(['Date'])
            df_temp.columns = [', '.join(independent)]
        df_results = pd.concat([df_results, df_temp], axis=0)
    return df_results
df_rolling = RegressionRoll(df=df, subset=50, dependent='X1', independent=['X2'],
                            const=True, parameters='beta', win=30)
Output: a dataframe with beta estimates for the OLS of X2 on X1 for each 30-period window of the data.
const X2
Date
2018-12-30 0.044042 0.032680
2018-12-31 0.074839 -0.023294
2019-01-01 -0.063200 0.077215
.
.
.
2019-01-16 -0.075938 -0.215108
2019-01-17 -0.143226 -0.215524
2019-01-18 -0.129202 -0.170304
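As a footnote to this answer: recent statsmodels versions ship a dedicated rolling estimator, statsmodels.regression.rolling.RollingOLS, which provides what the defunct window_type argument was presumably meant to do. A minimal sketch, assuming the same df with columns X1 and X2:

import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

exog = sm.add_constant(df[['X1']])
rolling_beta = RollingOLS(df['X2'], exog, window=30).fit()
print(rolling_beta.params.tail())  # one (const, X1) row per 30-observation window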
