Using a custom function with multiple parameters and partitioning/grouping by a column - python

Is it possible to do something like the following in python polars:
import polars as pl
import statsmodels.api as sm
lowess = sm.nonparametric.lowess
df = pl.DataFrame([pl.Series('x', ['a', 'a', 'a', 'b', 'b', 'b']),
                   pl.Series('y', [1, 2, 3, 1, 2, 3]),
                   pl.Series('z', [.2, .3, .5, .1, .3, .7])])
df.with_columns([
    pl.struct(['z', 'y']).map(lambda cols: pl.DataFrame(lowess(cols['z'], cols['y'], frac=.1))).over('x')
])
I want to group by one or more columns and then apply a function with more than 1 argument.

With the window function .over(), you should use the .apply() function. To access a specific field of a struct, you can use the .struct.field() method.
def get_lowess(s: pl.Series) -> pl.Series:
    return pl.Series(
        lowess(s.struct.field("z"), s.struct.field("y"), frac=.1)
    )

df.with_columns([
    pl.struct(['z', 'y']).apply(get_lowess).over('x').alias("lowess")
])
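As a side note on what the smoother returns (useful when deciding how to wire it into Polars): statsmodels' lowess(endog, exog, frac=...) returns a two-column NumPy array, sorted by exog, with the smoothed values in the second column. A minimal standalone sketch, independent of Polars:
import numpy as np
import statsmodels.api as sm

lowess = sm.nonparametric.lowess

y = np.linspace(0, 10, 50)                             # exog (the 'y' column above)
z = np.sin(y) + np.random.normal(scale=0.1, size=50)   # endog (the 'z' column above)
out = lowess(z, y, frac=0.3)
print(out.shape)  # (50, 2): column 0 is sorted y, column 1 is the smoothed z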

Would my algorithm benefit in terms of speed from vectorization and what could that look like?

I want to use the values in each row of the df as parameters to simulate data N times and store the sum of the simulated data somewhere.
The reproducible example works, but as N gets larger in combination with more rows in df it becomes very time consuming.
Is there a way to optimize this? I have been looking into vectorization, and maybe using multiple generators to pipeline the operations or a list comprehension, but I am stuck.
import numpy as np
import pandas as pd
from pert import PERT
data = [[0.1, 0.14, 0.25, 50, 100, 150], [0.01, 0.03, 0.1, 200, 250, 300]]
df = pd.DataFrame(data, columns = ["A", "B", "C", "D", "E", "F"])
N = 1000
confidence = 4
empty_list = []
for _ in range(N):
    df["P"] = df.apply(lambda x: PERT(x.A, x.B, x.C, confidence).rvs(1), axis=1).astype(float)
    df["O"] = df.apply(lambda x: np.random.binomial(1, x.P, 1), axis=1).astype(int)
    df["L"] = df.apply(lambda x: PERT(x.D, x.E, x.F, confidence).rvs(1) if x.O == 1 else 0, axis=1).astype(int)
    empty_list.append(sum(df["L"]))
df1 = pd.DataFrame(empty_list, columns=["L"])
P50 = np.percentile(df1["L"], 50).astype(int)
My first answer took a general approach to vectorization, not knowing what PERT really was or where it came from. Assuming that PERT is from pertdist, there is a much shorter and easier way to vectorize the code. The other answer is still useful for people who come here with a slightly different problem.
PERT directly allows drawing more than one sample at a time.
That makes things a lot easier.
import numpy as np
import pandas as pd
from pert import PERT # pertdist package
data = [[0.1, 0.14, 0.25, 50, 100, 150], [0.01, 0.03, 0.1, 200, 250, 300]]
df = pd.DataFrame(data, columns = ["A", "B", "C", "D", "E", "F"])
N = 1000
confidence = 4
P = PERT(df["A"], df["B"], df["C"], confidence).rvs((N, len(df)))
O = np.random.binomial(np.ones(len(df), dtype=int), P)
L = PERT(df["D"], df["E"], df["F"], confidence).rvs((N, len(df)))
L = np.where(O == 1, L, 0).astype(int)
data1 = np.sum(L, axis=1)
P50 = np.percentile(data1, 50).astype(int)
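As a small addition (not part of the original answer), np.percentile also accepts a list of percentiles, so several summary statistics of the N simulated sums can be pulled in one call:
# Several percentiles of the N simulated sums in one call.
p10, p25, p50, p75, p90 = np.percentile(data1, [10, 25, 50, 75, 90])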
This answer doesn't make use of the capability of PERT (from pertdist) to draw more than one sample at a time.
Using that capability reduces the complexity of the vectorization a lot.
This is outlined in my second answer.
This answer is still useful for people who come here with different problems that they want to vectorize, as it uses a more general approach.
I think you may get a nice speed-up from vectorization.
When vectorizing, you would:
1. replace apply with computing "P", "O" and "L" each in a single go for the entire data frame,
2. avoid the for loop, for example by using a 3D numpy array.
For step 1, you would do:
import numpy as np
import pandas as pd
from pert import PERT # pertdist package
data = [
    [0.1, 0.14, 0.25, 50, 100, 150],
    [0.01, 0.03, 0.1, 200, 250, 300],
]
confidence = 4
df = pd.DataFrame(data, columns = ["A", "B", "C", "D", "E", "F"])
# Just pass None for size to get a vectorized version of PERT.
# I think this is undocumented behaviour of scipy.stats.beta,
# which PERT.rvs uses under the hood.
df["P"] = PERT(df["A"], df["B"], df["C"], confidence).rvs(None)
df["O"] = np.random.binomial(np.ones(len(df), dtype=int), df["P"])
L = PERT(df["D"], df["E"], df["F"], confidence).rvs(None)
df["L"] = np.where(df["O"] == 1, L, 0).astype(int) # replaces ... if ... else ...
sum(df["L"])
Now I have computed one result, but we need N. Doing this in one go is a bit more tricky. This is step 2. One idea is not to use a 2D data frame, but a 3D numpy array (a 3D tensor).
Luckily PERT also supports this.
import numpy as np
from pert import PERT # pertdist package
data = np.array([
[0.1, 0.14, 0.25, 50, 100, 150],
[0.01, 0.03, 0.1, 200, 250, 300],
])
N = 1000
confidence = 4
data_N = np.array([  # there is probably a faster way to do this
    data
    for _ in range(N)
])
print(data_N.shape) # 1000, 2, 6 => 3D array/tensor
A = data_N[:, :, 0]
B = data_N[:, :, 1]
C = data_N[:, :, 2]
D = data_N[:, :, 3]
E = data_N[:, :, 4]
F = data_N[:, :, 5]
P = PERT(A, B, C, confidence).rvs(None)
print(P.shape) # 1000, 2 => P values for each row of the original df, 1000 times
# independently drawing samples for each value in P
O = np.random.binomial(np.ones_like(P, dtype=int), P)
L = PERT(D, E, F, confidence).rvs(None)
L = np.where(O == 1, L, 0).astype(int) # replaces ... if ... else ...
data1 = np.sum(L, axis=1) # sum over L for each of the N samples
print(data1.shape) # 1000, => 1000 results
for percent in [10, 25, 50, 75, 90]:
    print(np.percentile(data1, percent))
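Regarding the "there is probably a faster way to do this" comment above: the stacked array can be built without a Python-level loop, for example with np.broadcast_to or np.repeat. A minimal sketch using the same data and N as above:
# Build the (N, 2, 6) stack without a Python loop.
# np.broadcast_to returns a read-only view, which is fine here because
# the array is only sliced afterwards, never written to.
data_N = np.broadcast_to(data, (N,) + data.shape)
# If a writable copy is needed, np.repeat works as well:
# data_N = np.repeat(data[np.newaxis, :, :], N, axis=0)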

Adding two plots with different x start

I'm trying to merge two curves into only one, but the original two lines are shifted horizontally:
import numpy as np
a = np.array([[1, 2, 3],          # x values
              [0.3, 0.5, 0.6]])   # y values
b = np.array([[2, 3, 4], [0.5, 0.4, 0.3]])
I need c = [[1, 2, 3, 4], [0.3, 1, 1, 0.3]]
How can I do this?
You can use pandas:
import pandas as pd
import numpy as np
a = np.array([[1, 2, 3],          # x values
              [0.3, 0.5, 0.6]])   # y values
b = np.array([[2, 3, 4], [0.5, 0.4, 0.3]])
pd.concat(map(pd.DataFrame, [a.T, b.T])).groupby(0).sum().plot()
Output: (plot of the merged curve, with overlapping y values summed)
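If you want the merged arrays themselves rather than just the plot, a plain-NumPy sketch along the same lines (summing the y values at shared x positions, and assuming each curve's x values are sorted as in the example) could look like this:
import numpy as np

a = np.array([[1, 2, 3], [0.3, 0.5, 0.6]])
b = np.array([[2, 3, 4], [0.5, 0.4, 0.3]])

# Union of the two x grids, then add each curve's y values where its x appears.
x = np.union1d(a[0], b[0])
y = np.zeros_like(x)
for curve in (a, b):
    # np.isin marks which positions of the merged grid this curve covers.
    y[np.isin(x, curve[0])] += curve[1]

c = np.vstack([x, y])
print(c)  # x: 1 2 3 4, y: 0.3 1.0 1.0 0.3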

Catboost - Unexpected behavior in the java api

I'm training a toy model in python, saving the model binary to disk, and then loading it into java (kotlin) and evaluating. My predictions don't agree between python and kotlin. Anyone know what I'm doing wrong?
import catboost as cb
import pandas as pd
x = pd.DataFrame(data={'a': [1, 3, 4, 99, 12],
                       'b': [0.5, 0, 1.3, 3, 44],
                       'c': [0.5, 0, 1.3, 0.91, 0],
                       'd': ['a', 'b', 'c', 'd', 'e']
                       })
y = pd.DataFrame(data={'y': [1.23, 3.2, 1.0, 1.5, 0.2]})
model = cb.CatBoostRegressor()
model.fit(x, y, cat_features=[3], verbose=False, plot=False)
proba = model.predict([1, 0.5, 0.5, 'c'])
print(proba) # 1.2274747745772228
model.save_model('./very_basic_model.cbm')
val model = CatBoostModel.loadModel("~/path/very_basic_model.cbm")
val floatFeatures = floatArrayOf(
    1.0f,
    0.5f,
    0.5f
)
val categoricalFeatures: Array<String> = arrayOf("c")
val pred = model.predict(floatFeatures, categoricalFeatures).get(0, 0)
System.out.println(pred) // -0.198525224469103
The answer is that the catboost java api wasn't correctly implemented until a fairly recent version. When I updated to version "0.24", I could confirm parity between python and java.
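For anyone hitting the same mismatch, the fix is just to bump the JVM dependency. Assuming the model is consumed via the catboost-prediction artifact on Maven Central (an assumption about the build setup, not stated in the question), the Gradle change would look roughly like:
// build.gradle.kts -- assumed coordinates for the CatBoost JVM prediction library;
// verify the artifact name and current version on Maven Central.
dependencies {
    implementation("ai.catboost:catboost-prediction:0.24")
}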

Is there a concise way to produce this dataframe mask?

I have a pandas DataFrame with a model score and order_amount_bucket. There are 8 bins in the order amount bucket and I have a different threshold for each bin. I want to filter the frame and produce a boolean mask showing which rows pass.
I can do this by exhaustively listing the conditions but I feel like there must be a more pythonic way to do this.
A small example of how I have made this work so far (with only 3 bins for simplicity).
import pandas as pd
sc = 'score'
amt = 'order_amount_bucket'
example_data = {sc: [0.5, 0.8, 0.99, 0.95, 0.8, 0.8],
                amt: [1, 2, 2, 2, 3, 1]}
thresholds = [0.7, 0.8, 0.9]
df = pd.DataFrame(example_data)
# the exhaustive method to create the pass mask
# is there a better way to do this part?
pass_mask = (((df[amt] == 1) & (df[sc] < thresholds[0]))
             | ((df[amt] == 2) & (df[sc] < thresholds[1]))
             | ((df[amt] == 3) & (df[sc] < thresholds[2]))
             )
pass_mask.values
>> array([ True, False, False, False, True, False])
You could convert thresholds to a dict and use Series.map:
d = dict(enumerate(thresholds, 1))
# d: {1: 0.7, 2: 0.8, 3: 0.9}
pass_mask = df['order_amount_bucket'].map(d) > df['score']
[out]
print(pass_mask.values)
array([ True, False, False, False, True, False])
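An equivalent trick, if the buckets are always the contiguous integers 1..len(thresholds), is to index a NumPy array of thresholds directly (a small sketch, not part of the original answer):
import numpy as np

# Look up each row's threshold by bucket (assumed 1-based and contiguous),
# then compare against the score column.
thr = np.asarray(thresholds)
pass_mask = df[sc].to_numpy() < thr[df[amt].to_numpy() - 1]
print(pass_mask)  # [ True False False False  True False]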

Pandas median over grouped by binned data

I have a dataframe with users, score, times, where each user's different scores and the number of times they received it are listed:
user1, 1, 4
user1, 7, 2
user2, 3, 1
user2, 10, 2
and so on.
I'd like to calculate for each user the median of the scores.
For that I guess I should create a row-duplicated df, such as -
user1,1
user1,1
user1,1
user1,1
user1,7
user1,7
user2,3
user2,10
user2,10
and then use groupBy and apply to calculate the median somehow?
My questions -
Is this the correct approach? My df is very large, so the solution has to be time-efficient.
If this is indeed the way to go, can you please advise how? It keeps failing for me whatever I try to do.
I believe you need a weighted median. I used the function weighted_median from here; you can also try wquantile's weighted.median, but it interpolates in a slightly different way, so you may get unexpected results:
import numpy as np
import pandas as pd

# from here: https://stackoverflow.com/a/32921444/3025981, CC BY-SA by Afshin # SE
def weighted_median(values, weights):
    '''compute the weighted median of values list. The
    weighted median is computed as follows:
    1- sort both lists (values and weights) based on values.
    2- select the 0.5 point from the weights and return the corresponding values as results
    e.g. values = [1, 3, 0] and weights=[0.1, 0.3, 0.6] assuming weights are probabilities.
    sorted values = [0, 1, 3] and corresponding sorted weights = [0.6, 0.1, 0.3] the 0.5 point on
    weight corresponds to the first item which is 0. so the weighted median is 0.'''

    # convert the weights into probabilities
    sum_weights = sum(weights)
    weights = np.array([(w * 1.0) / sum_weights for w in weights])
    # sort values and weights based on values
    values = np.array(values)
    sorted_indices = np.argsort(values)
    values_sorted = values[sorted_indices]
    weights_sorted = weights[sorted_indices]
    # select the median point
    it = np.nditer(weights_sorted, flags=['f_index'])
    accumulative_probability = 0
    median_index = -1
    while not it.finished:
        accumulative_probability += it[0]
        if accumulative_probability > 0.5:
            median_index = it.index
            return values_sorted[median_index]
        elif accumulative_probability == 0.5:
            median_index = it.index
            it.iternext()
            next_median_index = it.index
            return np.mean(values_sorted[[median_index, next_median_index]])
        it.iternext()
    return values_sorted[median_index]
# end from

def wmed(group):
    return weighted_median(group['score'], group['times'])
import pandas as pd
df = pd.DataFrame([
    ['user1', 1, 4],
    ['user1', 7, 2],
    ['user2', 3, 1],
    ['user2', 10, 2]
], columns=['user', 'score', 'times'])
groups = df.groupby('user')
groups.apply(wmed)
# user
# user1 1
# user2 10
# dtype: int64
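Another answer takes a plain-Python route instead: expand each user's scores into a list and take the median of that directly.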
df = pd.DataFrame({'user': ['user1', 'user1', 'user2', 'user2'],
                   'score': [1, 7, 3, 10],
                   'times': [4, 2, 1, 2]})
# Create dictionary of empty lists keyed on user.
scores = {user: [] for user in df.user.unique()}
# Expand list of scores for each user using a list comprehension.
_ = [scores[row.user].extend([row.score] * row.times) for row in df.itertuples()]
>>> scores
{'user1': [1, 1, 1, 1, 7, 7], 'user2': [3, 10, 10]}
# Now you can use a dictionary comprehension to calculate the median score of each user.
>>> {user: np.median(scores[user]) for user in scores}
{'user1': 1.0, 'user2': 10.0}
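If you would rather stay inside pandas, the row-duplication idea from the question can also be done in a vectorized way with Index.repeat and an ordinary groupby median; a minimal sketch using the same df as above:
# Repeat each row 'times' times, then take a plain groupby median.
expanded = df.loc[df.index.repeat(df['times'])]
medians = expanded.groupby('user')['score'].median()
print(medians)
# user
# user1     1.0
# user2    10.0
# Name: score, dtype: float64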
