Getting the indexes of a Dataframe after a numpy array function - python

I have a function which implements the k-means algorithm and I want to use it with DataFrames in order to take the indexes into account. For the moment I use DataFrame.values and it works, yet I don't get the indexes of the output.
import random
import numpy as np

def cluster_points(X, mu):
    clusters = {}
    for x in X:
        bestmukey = min([(i[0], np.linalg.norm(x - mu[i[0]]))
                         for i in enumerate(mu)], key=lambda t: t[1])[0]
        try:
            clusters[bestmukey].append(x)
        except KeyError:
            clusters[bestmukey] = [x]
    return clusters

def reevaluate_centers(mu, clusters):
    newmu = []
    keys = sorted(clusters.keys())
    for k in keys:
        newmu.append(np.mean(clusters[k], axis=0))
    return newmu

def has_converged(mu, oldmu):
    return (set([tuple(a) for a in mu]) == set([tuple(a) for a in oldmu]))

def find_centers(X, K):
    # Initialize to K random centers
    oldmu = random.sample(X, K)
    mu = random.sample(X, K)
    while not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        clusters = cluster_points(X, mu)
        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters)
    return (mu, clusters)
For instance, with this minimal, self-contained example:
import pandas as pd

df = pd.DataFrame(np.random.randint(0, 10, size=(10, 5)), index=list(range(10)), columns=list(range(5)))
df.index.name = 'subscriber_id'
df.columns.name = 'ad_id'
I get:
find_centers(df.values, 2)
([array([ 3.8,  3. ,  3.6,  2. ,  3.6]),
  array([ 6.8,  3.6,  5.6,  6.8,  6.8])],
 {0: [array([2, 0, 5, 6, 4]),
      array([1, 1, 2, 3, 3]),
      array([6, 0, 4, 0, 3]),
      array([7, 9, 4, 1, 7]),
      array([3, 5, 3, 0, 1])],
  1: [array([6, 2, 5, 9, 6]),
      array([8, 9, 7, 2, 8]),
      array([7, 5, 3, 7, 8]),
      array([7, 1, 5, 7, 6]),
      array([6, 1, 8, 9, 6])]})
I have the values but don't have the indexes.

If you want to get the array of values including the index, you can simply add the index to the columns with reset_index():
values_with_index = df.reset_index().values
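For the example df above, this produces a 10×6 array whose first column holds the subscriber_id labels:
values_with_index.shape  # (10, 6): column 0 is the subscriber_id index, columns 1-5 the ad_id values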
Update
If what you want is to have the index on the output, but not use it during the actual clustering, you can do the following. First, pass the actual data frame object to find_centers:
find_centers(df, 2)
Then change cluster_points as follows:
def cluster_points(X, mu):
    clusters = {}
    for _, x in X.iterrows():
        bestmukey = min([(i[0], np.linalg.norm(x - mu[i[0]]))
                         for i in enumerate(mu)], key=lambda t: t[1])[0]
        # You can replace this try/except block with
        # clusters.setdefault(bestmukey, []).append(x)
        try:
            clusters[bestmukey].append(x)
        except KeyError:
            clusters[bestmukey] = [x]
    return clusters
The centers in the output will still be arrays, but the clusters will now contain Series objects, one per row. The name property of each of these Series is that row's index value in the data frame.
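For example, since each row keeps its label, you can recover which subscriber_id values fell into each cluster (a sketch, assuming clusters is the dict returned by the modified cluster_points):
cluster_ids = {k: [row.name for row in rows] for k, rows in clusters.items()}
# maps each cluster key to the list of subscriber_id labels of its rows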

Related

Generating a new array based on certain criterion in Python

I have an array X and a list T2. I want to create a new array Xnew such that elements of X are placed according to locations specified in T2. I present the current and expected outputs.
import numpy as np

X = np.array([4.15887486e+02, 3.52446375e+02, 2.81627790e+02, 1.33584716e+02,
              6.32045703e+01, 2.07514659e+02, 1.00000000e-24])
T2 = [0, 3, 5, 8, 9, 10, 11]

def make_array(indices, values):
    rtrn = np.zeros(np.max(indices) + 1, dtype=values.dtype)
    rtrn[indices] = values
    return

Xnew = np.array([make_array(Ti, Xi) for Ti, Xi in zip([T2], X)], dtype=object)
print("New X =", [Xnew])
The current output is
New X = [array([None], dtype=object)]
The expected output is
[array([[4.15887486e+02, 0.0, 0.0, 3.52446375e+02, 0.0,
         2.81627790e+02, 0.0, 0.0, 1.33584716e+02,
         6.32045703e+01, 2.07514659e+02, 1.00000000e-24]],
       dtype=object)]
You basically have what you need, but you are calling your function in a very weird way. The function works with numpy arrays / lists as input; you don't need to pass in individual elements. (Note also that your make_array ends with a bare return, so it returns None, which is why you see New X = [array([None], dtype=object)].)
X = np.arange(5)
ind = np.asarray([1, 4, 3, 2, 10])

def make_array(indices, values):
    rtrn = np.zeros(np.max(indices) + 1, dtype=values.dtype)
    rtrn[indices] = values
    return rtrn

make_array(ind, X)  # array([0, 0, 3, 2, 1, 0, 0, 0, 0, 0, 4])
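Applied directly to the data from the question, it produces the expected output:
make_array(T2, X)
# array([4.15887486e+02, 0.00000000e+00, 0.00000000e+00, 3.52446375e+02,
#        0.00000000e+00, 2.81627790e+02, 0.00000000e+00, 0.00000000e+00,
#        1.33584716e+02, 6.32045703e+01, 2.07514659e+02, 1.00000000e-24])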

Finding all the variables that give the highest Adjusted R squared value

I have a dataframe which stores different variables. I'm using OLS linear regression and using all of the variables to predict the 'price' column.
import pandas as pd
import statsmodels.api as sm

data = {'accommodates': [2, 2, 3, 2, 2, 6, 8, 4, 3, 2],
        'bedrooms': [1, 2, 1, 1, 3, 4, 2, 2, 2, 3],
        'instant_bookable': [1, 0, 1, 1, 1, 1, 0, 0, 0, 1],
        'availability_365': [123, 3, 33, 14, 15, 16, 3, 41, 61, 74],
        'minimum_nights': [3, 12, 1, 4, 6, 7, 2, 3, 6, 10],
        'beds': [2, 2, 3, 4, 1, 5, 6, 2, 3, 2],
        'price': [59, 234, 15, 162, 56, 42, 28, 52, 22, 31]}
df = pd.DataFrame(data, columns=['accommodates', 'bedrooms', 'instant_bookable',
                                 'availability_365', 'minimum_nights', 'beds', 'price'])
I have a for loop which calculates the Adjusted R squared value for each variable:
fit_d = {}
for columns in [x for x in df.columns if x != 'price']:
    Y = df['price']
    X = df[columns]
    X = sm.add_constant(X)
    model = sm.OLS(Y, X, missing='drop').fit()
    fit_d[columns] = model.rsquared
fit_d
How can I modify my code in order to find the combination of variables that gives the largest adjusted R squared value? Ideally the function would find the variable with the largest adjusted R squared value first, then, using that first variable, iterate over the remaining variables to get the pair that gives the highest value, then three variables, etc., until the value cannot be increased further. I'd like the output to be something like
Best variables: {'accommodates', 'availability', 'bedrooms'}
Here is a "brute force" way to try all possible combinations (from itertools) of different lengths and find the set of variables with the highest R value. The idea is two nested loops: one over the number of variables to try, and one over all combinations of that length.
from itertools import combinations

# all possible columns for X
cols = [x for x in df.columns if x != 'price']
# define Y once, the same across all loops
Y = df['price']
# define the result dictionary
fit_d = {}
# loop over every possible combination length
for i in range(1, len(cols) + 1):
    # loop over all combinations of length i
    for comb in combinations(cols, i):
        # define X from the combination
        X = df[list(comb)]
        X = sm.add_constant(X)
        # perform the OLS fit
        model = sm.OLS(Y, X, missing='drop').fit()
        # save the R squared in a dictionary
        fit_d[comb] = model.rsquared
# extract the key for the max R value
key_max = max(fit_d, key=fit_d.get)
print(f'Best variables {key_max} for a R-value of {round(fit_d[key_max], 5)}')
# Best variables ('accommodates', 'bedrooms', 'instant_bookable', 'availability_365', 'minimum_nights', 'beds') for a R-value of 0.78506
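Note that model.rsquared is the plain R squared, which can only increase as variables are added, so the full model always wins (as in the output above); for the adjusted value, use model.rsquared_adj. The stepwise procedure described in the question can also be sketched as a greedy forward selection (forward_select is an illustrative helper, reusing df and sm from above):
def forward_select(df, target='price'):
    # greedily add the variable that most improves adjusted R squared
    remaining = [c for c in df.columns if c != target]
    selected = []
    best_adj = float('-inf')
    Y = df[target]
    while remaining:
        # score each candidate added to the current selection
        scores = {c: sm.OLS(Y, sm.add_constant(df[selected + [c]]),
                            missing='drop').fit().rsquared_adj
                  for c in remaining}
        best_col = max(scores, key=scores.get)
        if scores[best_col] <= best_adj:
            break  # no candidate improves adjusted R squared any further
        selected.append(best_col)
        remaining.remove(best_col)
        best_adj = scores[best_col]
    return selected, best_adj

best_vars, best_adj_r2 = forward_select(df)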

I have written code to calculate the correlation between two Pandas Series. Can you tell me what is wrong with my code?

Below is the code:
import numpy as np
import pandas as pd
def correlation(x, y):
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)
    return (std_x * std_y).mean
a = pd.Series([2, 4, 5, 7, 9])
b = pd.Series([12, 10, 9, 7, 3])
ca = correlation(a, b)
print(ca)
It does not return the value of the correlation; instead it returns a Series with keys 0, 1, 2, 3, 4 and values -1.747504, -0.340844, -0.043282, -0.259691, -2.531987.
Please help me understand the problem behind this.
You need to call mean() with:
return (std_x * std_y).mean()
not just:
return (std_x * std_y).mean
which returns the method itself. Full code:
import numpy as np
import pandas as pd
def correlation(x, y):
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)
    return (std_x * std_y).mean()
a = pd.Series([2, 4, 5, 7, 9])
b = pd.Series([12, 10, 9, 7, 3])
ca = correlation(a, b)
print(ca)
Output:
-0.984661667628
You can also use scipy.stats to calculate a Pearson correlation. At a minimum, you can use this as a quick check that your algorithm is correct.
from scipy.stats import pearsonr
import pandas as pd
a = pd.Series([2, 4, 5, 7, 9])
b = pd.Series([12, 10, 9, 7, 3])
pearsonr(a, b)[0] # -0.98466166762781315
It might be worth mentioning that you can also ask pandas directly to calculate the correlation between two Series using corr, which also allows you to specify the type of correlation:
a = pd.Series([2, 4, 5, 7, 9])
b = pd.Series([12, 10, 9, 7, 3])
a.corr(b)
will then return
-0.98466166762781315
You can also apply corr to a DataFrame, which calculates all pairwise correlations between your columns (as each column is perfectly correlated with itself, you see 1s on the diagonal):
pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 8]}).corr()
          a         b
a  1.000000  0.960769
b  0.960769  1.000000

Python - how to add and subtract elements in array

If I have an array, let's say np.array([4,8,-2,9,6,0,3,-6]), and I would like to add the previous number to the next element, how do I do that?
And every time the number 0 shows up, the addition of elements 'restarts'.
An example with the above array: I should get the following output when I run the function:
stock = np.array([4,12,10,19,25,0,3,-3]) is the right output if the above array is passed in as transactions.
def cumulativeStock(transactions):
    # insert your code here
    return stock
I can't think of a way to solve this problem. Any help would be much appreciated.
I believe you mean something like this?
z = np.array([4, 8, -2, 9, 6, 0, 3, -6])
n = z == 0
# [False False False False False  True False False]
res = np.split(z, np.where(n)[0])
# [array([ 4,  8, -2,  9,  6]), array([ 0,  3, -6])]
res_total = [np.cumsum(x) for x in res]
# [array([ 4, 12, 10, 19, 25]), array([ 0,  3, -3])]
np.concatenate(res_total)
# [ 4 12 10 19 25  0  3 -3]
Another vectorized solution:
import numpy as np

stock = np.array([4, 8, -2, 9, 6, 0, 3, -6])
breaks = stock == 0
tmp = np.cumsum(stock)
# at each break, inject the negative of the running total so the cumsum resets to zero there
brval = np.diff(np.concatenate(([0], -tmp[breaks])))
stock[breaks] = brval
np.cumsum(stock)
# array([ 4, 12, 10, 19, 25,  0,  3, -3])
import numpy as np

stock = np.array([4, 12, 10, 19, 25, 0, 3, -3, 4, 12, 10, 0, 19, 25, 0, 3, -3])

def cumsum_stock(stock):
    ## Detect all zeros first
    zero_p = np.where(stock == 0)[0]
    ## Create an empty array to append the final result to
    final_stock = np.empty(shape=[0, len(zero_p)])
    for i in range(len(zero_p)):
        ## First zero detected
        if i == 0:
            stock_first_part = np.cumsum(stock[:zero_p[0]])
            stock_after_zero_part = np.cumsum(stock[zero_p[0]:zero_p[i + 1]])
            final_stock = np.append(final_stock, stock_first_part)
            final_stock = np.append(final_stock, stock_after_zero_part)
        ## Last zero detected
        elif i == (len(zero_p) - 1):
            stock_last_part = np.cumsum(stock[zero_p[i]:])
            final_stock = np.append(final_stock, stock_last_part, axis=0)
        ## Intermediate zero detected
        else:
            intermediate_stock = np.cumsum(stock[zero_p[i]:zero_p[i + 1]])
            final_stock = np.append(final_stock, intermediate_stock, axis=0)
    return final_stock

final_stock = cumsum_stock(stock).astype(int)
# Output
final_stock
Out[]: array([ 4, 16, 26, ...,  0,  3,  0])
final_stock.tolist()
Out[]: [4, 16, 26, 45, 70, 0, 3, 0, 4, 16, 26, 0, 19, 44, 0, 3, 0]
def cumulativeStock(transactions):
    def accum(x):
        acc = 0
        for i in x:
            if i == 0:
                acc = 0  # reset the running total at every zero
            acc += i
            yield acc
    stock = np.array(list(accum(transactions)))
    return stock
For your input np.array([4, 8, -2, 9, 6, 0, 3, -6]) it returns
array([ 4, 12, 10, 19, 25,  0,  3, -3])
I assume you mean you want to separate the list at every zero?
from itertools import groupby
import numpy

def cumulativeStock(transactions):
    # split the list at every 0
    all_lists = [list(group) for k, group in groupby(transactions, lambda x: x == 0) if not k]
    # cumulative-sum the items of each sublist
    stock = []
    for sep_list in all_lists:
        for item in numpy.cumsum(sep_list):
            stock.append(item)
    return stock

print(cumulativeStock([4, 8, -2, 9, 6, 0, 3, -6]))
Which will return (note that, unlike the expected output, the zeros themselves are dropped):
[4, 12, 10, 19, 25, 3, -3]

How to make numpy.cumsum start after the first value

I have:
import numpy as np
position = np.array([4, 4.34, 4.69, 5.02, 5.3, 5.7, ..., 4])
x = (B/position**2)*dt
A = np.cumsum(x)
assert A[0] == 0 # I want this to be true.
Where B and dt are scalar constants. This is for a numerical integration problem with initial condition of A[0] = 0. Is there a way to set A[0] = 0 and then do a cumsum for everything else?
I don't understand what exactly your problem is, but here are some things you can do to have A[0] = 0.
You can create A to be longer by one index to have the zero as the first entry:
# initialize example data
import numpy as np
B = 1
dt = 1
position = np.array([4, 4.34, 4.69, 5.02, 5.3, 5.7])
# do calculation
A = np.zeros(len(position) + 1)
A[1:] = np.cumsum((B/position**2)*dt)
Result:
A = [ 0. 0.0625 0.11559096 0.16105356 0.20073547 0.23633533 0.26711403]
len(A) == len(position) + 1
Alternatively, you can manipulate the calculation to subtract the first entry of the result:
# initialize example data
import numpy as np
B = 1
dt = 1
position = np.array([4, 4.34, 4.69, 5.02, 5.3, 5.7])
# do calculation
A = np.cumsum((B/position**2)*dt)
A = A - A[0]
Result:
[ 0. 0.05309096 0.09855356 0.13823547 0.17383533 0.20461403]
len(A) == len(position)
As you see, the results have different lengths. Is one of them what you expect?
1D cumsum
A wrapper around np.cumsum that sets first element to 0:
def cumsum(pmf):
    cdf = np.empty(len(pmf) + 1, dtype=pmf.dtype)
    cdf[0] = 0
    np.cumsum(pmf, out=cdf[1:])
    return cdf
Example usage:
>>> np.arange(1, 11)
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> cumsum(np.arange(1, 11))
array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55])
N-D cumsum
A wrapper around np.cumsum that sets first element to 0, and works with N-D arrays:
def cumsum(pmf, axis=None, dtype=None):
    if axis is None:
        pmf = pmf.reshape(-1)
        axis = 0
    if dtype is None:
        dtype = pmf.dtype
    idx = [slice(None)] * pmf.ndim
    # Create array with extra element along cumsummed axis.
    shape = list(pmf.shape)
    shape[axis] += 1
    cdf = np.empty(shape, dtype)
    # Set first element to 0.
    idx[axis] = 0
    cdf[tuple(idx)] = 0
    # Perform cumsum on remaining elements.
    idx[axis] = slice(1, None)
    np.cumsum(pmf, axis=axis, dtype=dtype, out=cdf[tuple(idx)])
    return cdf
Example usage:
>>> np.arange(1, 11).reshape(2, 5)
array([[ 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10]])
>>> cumsum(np.arange(1, 11).reshape(2, 5), axis=-1)
array([[ 0, 1, 3, 6, 10, 15],
[ 0, 6, 13, 21, 30, 40]])
I totally understand your pain; I wonder why NumPy doesn't allow this with np.cumsum. Anyway, though I'm really late and there's already another good answer, I prefer this one a bit more:
np.cumsum(np.pad(array, (1, 0), "constant"))
where array in your case is (B/position**2)*dt. You can change the order of np.pad and np.cumsum as well. I'm just adding a zero to the start of the array and calling np.cumsum.
You can use roll (shift right by 1) and then set the first entry to zero.
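For example (a sketch; note the result keeps the input's length, so the final cumulative total is discarded):
import numpy as np
x = np.array([1, 2, 3, 4])
A = np.roll(np.cumsum(x), 1)  # [10, 1, 3, 6]: the last total wraps around to the front
A[0] = 0                      # overwrite the wrapped value with the initial condition
A                             # [0, 1, 3, 6]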
