I'm trying to apply a function to each segment/partition that is the result of a groupby operation.
def get_click_rate(data):
    click_count = data[data['event'] == 'click'].shape[0]
    view_count = data[data['event'] == 'pageview'].shape[0]
    return click_count / view_count

data.groupby('linkid').apply(get_click_rate).reset_index(name='click rate')
The idea is that I'm grouping the dataframe by the LinkID of a webpage, then passing each partition to a function that filters the sub-dataframe, computes a number, and returns it. However, it returns the wrong figures. Here is the snippet that returns the right figures:
click_event = data[data['event'] == 'click'].groupby('linkid')['event'].count().reset_index(name='click count')
view_event = data[data['event'] == 'pageview'].groupby('linkid')['event'].count().reset_index(name='view count')
merged_df = pd.merge(left=click_event, right=view_event, on='linkid', how='inner')
merged_df['click rate'] = merged_df['click count'] / merged_df['view count']
To my eyes at least, they are doing the same thing, just in a different order: the second snippet filters the data first, groups it, and then merges the sub-dataframes to reach the desired figures.
Can someone help shed some light on what I'm missing here?
I've tried your get_click_rate function here and it seems to return the same result as the second approach you've written. The only problem I've encountered with get_click_rate is when you try to compute the click rate for a linkid group that has no 'pageview' event: the division then raises a ZeroDivisionError. Because of that, I've made some minor changes to your get_click_rate function, shown after the short failure sketch below.
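To illustrate, here is a minimal sketch of the failure case (the group below is hypothetical, containing only 'click' events, so view_count is zero):

import pandas as pd

# Hypothetical linkid group with clicks but no 'pageview' rows.
no_view_group = pd.DataFrame({'event': ['click', 'click'], 'linkid': [4, 4]})
# get_click_rate(no_view_group)  # raises ZeroDivisionError with the original function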
import pandas as pd


def get_click_rate(data: pd.DataFrame) -> pd.Series:
    """Calculate the click rate for a given ``linkid`` group.

    Parameters
    ----------
    data : pd.DataFrame
        A dataframe representing values from a given ``linkid`` group,
        containing the column 'event'.

    Returns
    -------
    pd.Series
        A series containing the click count, view count, and click rate.
        The click rate is calculated as the ratio of the click count to
        the view count. If the view count is zero, the click rate is set
        to zero.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'event': ['click', 'pageview', 'click', 'some_other_value'],
    ...      'linkid': [1, 1, 2, 2]}
    ... )
    >>> df.groupby('linkid').apply(get_click_rate).reset_index()
       linkid  click count  view count  click_rate
    0       1          1.0         1.0         1.0
    1       2          1.0         0.0         0.0

    Notes
    -----
    This function returns a pandas Series regardless of whether the
    ``linkid`` group contains any views or not. Therefore, if you want
    only the ``linkid``s that have a non-zero click rate, you can use the
    following code:

    .. code-block:: python

        (
            df.groupby('linkid')
            .apply(get_click_rate)
            .reset_index()
            .loc[lambda xdf: xdf['click_rate'] > 0, :]
        )
    """
    click_count: int = data[data['event'] == 'click'].shape[0]
    view_count: int = data[data['event'] == 'pageview'].shape[0]
    click_rate: float = 0
    # Only compute the click rate when `view_count` is greater than zero.
    if view_count > 0:
        click_rate = round(click_count / view_count, 2)
    return pd.Series({'click count': click_count,
                      'view count': view_count,
                      'click_rate': click_rate})
Testing get_click_rate Function
import pandas as pd
import numpy as np
# get_click_rate is defined above.
event_choices = ['click', 'pageview', 'some_other_value']
linkid_choices = ['1', '2', '3']
nrows = 30

# -- Generating a Dummy DataFrame for Testing ------------------------------
df = pd.concat(
    [
        pd.DataFrame(
            {
                'event': np.random.choice(event_choices, nrows),
                'linkid': np.random.choice(linkid_choices, nrows),
            }
        ),
        pd.DataFrame({'event': ['some_other_value'] * 3, 'linkid': '4'})
    ], ignore_index=True
)
(
    # Group the dataframe by the `linkid` column.
    df.groupby('linkid')
    # Apply `get_click_rate`, which returns a pandas Series with three values
    # ('click count', 'view count' and 'click_rate') for every 'linkid' group.
    .apply(get_click_rate)
    .reset_index()
    # Convert the 'click count' and 'view count' columns to integers.
    .astype({'click count': int, 'view count': int})
    # Keep only the `linkid`s that have a click rate greater than zero.
    .loc[lambda xdf: xdf['click_rate'] > 0, :]
)
# Returns:
#
# linkid click count view count click_rate
# 0 1 2 3 0.67
# 1 2 3 3 1.00
# 2 3 3 6 0.50
How to count between A and B in the same column of a dataframe (pandas)? I want to count "CUT" in the m/c code column between "STD" and "STD", which are repeated many times in the column. See the image attached below.
Another solution to your problem is to create an auxiliary column that labels each interval, and then use groupby together with the transform method to perform the counting. A minimal sketch of the idea comes first, followed by the full code.
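To make the idea concrete before the full code, here is a minimal sketch on a tiny hand-made frame (the values below are hypothetical): rows between "END" and "STD" share the same flag, and transform counts "CUT" within each flag.

import pandas as pd

mini = pd.DataFrame({'m/c code': ['END', 'CUT', 'BL1', 'CUT', 'STD', 'CLN']})
mini['FLAG'] = [1, 1, 1, 1, 1, 0]  # one interval spanning the END..STD rows
mini['Count'] = (mini.groupby('FLAG')['m/c code']
                 .transform(lambda g: (g == 'CUT').sum()))
mini.loc[mini['FLAG'] == 0, 'Count'] = 0  # rows outside any interval get 0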
from __future__ import annotations
import pandas as pd
import numpy as np
# == Helper functions (Not part of the actual solution) =======================
# You can ignore these functions, as they aren't actually part of the
# solution, but rather a way to generate some data to test the implementation.
def random_dates(
    start_date: str | pd.Timestamp,
    end_date: str | pd.Timestamp,
    size: int = 10,
) -> pd.DatetimeIndex:
    """Generate random dates between two dates.

    Parameters
    ----------
    start_date : str | pd.Timestamp
        Start date.
    end_date : str | pd.Timestamp
        End date.
    size : int, optional
        Number of dates to generate, by default 10.

    Returns
    -------
    pd.DatetimeIndex
        Random dates.

    Examples
    --------
    >>> random_dates("2020-01-01", "2020-01-31", size=5)  # doctest: +ELLIPSIS
    DatetimeIndex(['2020-01-05', '2020-01-12', ...], dtype='datetime64[ns]', freq=None)
    """
    start_u = pd.to_datetime(start_date).value // 10**9
    end_u = pd.to_datetime(end_date).value // 10**9
    return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="s")
def generate_random_frame(
    start_date: str | pd.Timestamp,
    end_date: str | pd.Timestamp,
    size: int = 10,
) -> pd.DataFrame:
    """
    Generate a DataFrame to test the solution.

    Parameters
    ----------
    start_date : str | pd.Timestamp
        Start date. Must be a string representing a date, like "YYYY-MM-DD",
        or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp
        object.
    end_date : str | pd.Timestamp
        End date. Must be a string representing a date, like "YYYY-MM-DD",
        or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp.
    size : int, default 10
        Number of rows to generate.

    Returns
    -------
    pd.DataFrame
        DataFrame with random dates and random values. The resulting DataFrame
        has the following columns:
        - "Time": random datetimes between `start_date` and `end_date`.
        - "m/c code": random strings from a set of 7 possible values:
          "END", "CUT", "STD", "BL1", "ALS", "ST1", or "CLN".
    """
    mc_code_choices = ["END", "CUT", "STD", "BL1", "ALS", "ST1", "CLN"]
    return pd.DataFrame(
        {
            "Time": random_dates(start_date, end_date, size),
            "m/c code": np.random.choice(mc_code_choices, size),
        }
    )
# == Solution ==================================================================
def flag_groups_and_count(
    df: pd.DataFrame,
    group_colname: str = "m/c code",
    lowbound_value: str = "END",
    upbound_value: str = "STD",
    value_to_count: str = "CUT",
    count_colname: str = "Count",
    flag_colname: str = "FLAG",
) -> pd.DataFrame:
    """
    Flag groups and count the number of times a specified value appears in each group.

    Groups are defined by values between `lowbound_value` and `upbound_value`
    in the column `group_colname`. The flag is set to 1 for the first group,
    and subsequent groups are flagged as 2, 3, etc. Rows flagged as 0
    represent the absence of a group.

    After flagging the groups, the function counts the number of times
    the value specified by the `value_to_count` parameter appears.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to flag.
    group_colname : str, default "m/c code"
        Column name to group by.
    lowbound_value : str, default "END"
        Value that starts each group.
    upbound_value : str, default "STD"
        Value that ends each group.
    value_to_count : str, default "CUT"
        Value to count inside each group.
    count_colname : str, default "Count"
        Name of the column to store the counts.
    flag_colname : str, default "FLAG"
        Name of the column to store each group number.

    Returns
    -------
    pd.DataFrame
        Original DataFrame with the added flag and count columns.
    """
    # Set the initial parameters, used to control the creation of the groups.
    current_group = 1   # The current group number.
    flag_row = False    # Indicates whether the current row should be flagged.

    # Create the column that stores the group numbers.
    # Set all values initially to 0.
    df[flag_colname] = 0

    # Iterate over each row of the dataframe.
    # - index: index of each row. Same values you find by calling df.index
    # - row: a pandas Series object with the values of each row.
    for index, row in df.iterrows():
        # If the current row has a 'm/c code' value equal to 'END',
        # set the flag_row variable to True to indicate that the
        # following rows should be set to `current_group` until
        # a row with 'm/c code' value equal to 'STD' is found.
        if row[group_colname] == lowbound_value:
            flag_row = True

        # Does this row belong to a group? If so, set it to `current_group`.
        if flag_row:
            df.loc[df.index.isin([index]), flag_colname] = current_group

        # If the current row has a 'm/c code' value equal to 'STD',
        # then we reached the end of a group. Set the flag_row variable
        # to False, indicating that the next rows should not be flagged
        # as part of a group.
        if row[group_colname] == upbound_value:
            # Did we reach the end of a group, or simply find another value
            # equal to "STD" before the next interval starts?
            # This check avoids incrementing the group number when we didn't
            # actually reach a new interval.
            if flag_row:
                current_group += 1
            flag_row = False

    # Group the 'm/c code' column values by the newly created flag column.
    # Inside this groupby, use the `transform` method to count the number of
    # times the value "CUT" appears inside each group.
    # Store the count in a new column called "Count".
    df[count_colname] = df.groupby(flag_colname, as_index=False)[
        group_colname
    ].transform(lambda group: (group == value_to_count).sum())
    # Same as:
    # df["Count"] = df.groupby("FLAG", as_index=False)[
    #     "m/c code"
    # ].transform(lambda group: (group == "CUT").sum())

    # When the flag column is equal to 0, it means that there's no interval.
    # Therefore, set such counts to 0. Intervals represent the rows with
    # values for the 'm/c code' column between adjacent "END" and "STD" values.
    df.loc[df[flag_colname] == 0, count_colname] = 0
    # Same as: df.loc[df["FLAG"] == 0, "Count"] = 0
    return df
# == Test our code =============================================================
# Parameters to use for generating test DataFrame:
start_date = "2021-07-01 00:00:00"
end_date = "2021-07-03 00:00:00"
# Generate test DataFrame
test_df = generate_random_frame(start_date, end_date, size=30)
# Call the function that implements the solution to the problem.
test_df = test_df.pipe(flag_groups_and_count)
test_df
Here's a screenshot of the output: the result keeps the original Time and m/c code columns and adds the new FLAG and Count columns.
There are obviously many different ways.
One solution is this:
import pandas as pd

def c_counter(df, A, B, C):
    # Positional indices of the rows that start (A) and end (B) each interval.
    starts = [i for i, x in enumerate(df['m/c code']) if x == A]
    ends = [i for i, x in enumerate(df['m/c code']) if x == B]
    df['Count'] = ''
    for start, end in zip(starts, ends):
        # Count occurrences of C within the interval and write it to those rows
        # (the closing B row is left blank, as in the output below).
        df.iloc[start:end, df.columns.get_loc('Count')] = (df['m/c code'].iloc[start:end] == C).sum()
    return df
df = pd.DataFrame({'m/c code': ['A', 'X', 'C', 'X', 'C', 'X', 'B', 'X', 'A', 'C', 'X', 'B']})
A = 'A'
B = 'B'
C = 'C'
c_counter(df, A, B, C)
Out:
m/c code Count
0 A 2
1 X 2
2 C 2
3 X 2
4 C 2
5 X 2
6 B
7 X
8 A 1
9 C 1
10 X 1
11 B
Next time, please make sure to include sample code.
I have:
haves = pd.DataFrame({'Product':['R123','R234'],
'Price':[1.18,0.23],
'CS_Medium':[1, 0],
'CS_Small':[0, 1],
'SC_A':[1,0],
'SC_B':[0,1],
'SC_C':[0,0]})
print(haves)
given a list of columns, like so:
list_of_starts_with = ["CS_", "SC_"]
I would like to arrive here:
wants = pd.DataFrame({'Product':['R123','R234'],
'Price':[1.18,0.23],
'CS':['Medium', 'Small'],
'SC':['A', 'B'],})
print(wants)
I am aware of wide_to_long, but I don't think it is applicable here.
We could use the column names containing '_' as a mask, convert those columns' values to booleans to pick out the active suffix per row, and then join the result back to the original DataFrame:
msk = haves.columns.str.contains('_')
s = haves.loc[:, msk].astype(bool)
s = s.apply(lambda x: dict(s.columns[x].str.split('_')), axis=1)
out = haves.loc[:, ~msk].join(pd.DataFrame(s.tolist(), index=s.index))
Output:
Product Price CS SC
0 R123 1.18 Medium A
1 R234 0.23 Small B
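For clarity, the lambda builds a {prefix: suffix} mapping per row from the dummy columns set to 1; with the example data, s looks like this right before the join:

# 0    {'CS': 'Medium', 'SC': 'A'}
# 1    {'CS': 'Small', 'SC': 'B'}
# dtype: object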
Based on the list of columns (assuming the starts_with prefixes are enough to identify them), it is possible to do the changes in bulk:
def preprocess_column_names(list_of_starts_with, column_names):
    "Returns a list of tuples (merged_column_name, options, columns)"
    columns_to_transform = []
    for starts_with in list_of_starts_with:
        len_of_start = len(starts_with)
        columns = [col for col in column_names if col.startswith(starts_with)]
        options = [col[len_of_start:] for col in columns]
        merged_column_name = starts_with[:-1]  # Assuming that the last char is not needed
        columns_to_transform.append((merged_column_name, options, columns))
    return columns_to_transform

def merge_columns(df, merged_column_name, options, columns):
    for col, option in zip(columns, options):
        df.loc[df[col] == 1, merged_column_name] = option
    return df.drop(columns=columns)

def merge_all(df, columns_to_transform):
    for merged_column_name, options, columns in columns_to_transform:
        df = merge_columns(df, merged_column_name, options, columns)
    return df
And to run:
columns_to_transform = preprocess_column_names(list_of_starts_with, haves.columns)
wants = merge_all(haves, columns_to_transform)
If your column names are not surprising (such as Index_ being in list_of_starts_with), the above code should solve the problem with reasonable performance.
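To make the intermediate structure concrete, this is what preprocess_column_names returns for the example data:

# columns_to_transform ==
# [('CS', ['Medium', 'Small'], ['CS_Medium', 'CS_Small']),
#  ('SC', ['A', 'B', 'C'], ['SC_A', 'SC_B', 'SC_C'])]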
One option is to convert the data to a long form, filter for rows that have a value of 1, then convert back to wide form. We can use pivot_longer from pyjanitor for the wide to long part, and pivot to return to wide form:
# pip install pyjanitor
import pandas as pd
import janitor
(haves
 .pivot_longer(index=["Product", "Price"],
               names_to=("main", "other"),
               names_sep="_")
 .query("value==1")
 .pivot(index=["Product", "Price"],
        columns="main",
        values="other")
 .rename_axis(columns=None)
 .reset_index()
)
Product Price CS SC
0 R123 1.18 Medium A
1 R234 0.23 Small B
You can totally avoid pyjanitor by transforming the columns before reshaping (it still involves wide to long, then long to wide):
index = [col for col in haves
         if not col.startswith(tuple(list_of_starts_with))]
temp = haves.set_index(index)
temp.columns = (temp
                .columns.str.split("_", expand=True)
                .set_names(["main", "other"]))
# reshape to get the final dataframe
(temp
 .stack(["main", "other"])
 .loc[lambda df: df == 1]
 .reset_index("other")
 .drop(columns=0)
 .unstack()
 .droplevel(0, 1)
 .rename_axis(columns=None)
 .reset_index()
)
Product Price CS SC
0 R123 1.18 Medium A
1 R234 0.23 Small B
I have a dataframe:
Text
Background
Clinical
Method
Direct
Background
Direct
Now I want to group them in a new column according to their first words, e.g. Background belongs to group 1, Clinical belongs to group 2, and so on.
The expected output:
a dataframe:
Text Group
Background 1
Clinical 2
Method 3
Direct 4
Background 1
Direct 4
Try this:
import pandas as pd

text = ['Background', 'Clinical', 'Method', 'Direct', 'Background', 'Direct']
df = pd.DataFrame(text, columns=['Text'])

def create_idx_map():
    idx = 1
    values = {}
    for item in list(df['Text']):
        if item not in values:
            values[item] = idx
            idx += 1
    return values

values = create_idx_map()
df['Group'] = [values[x] for x in list(df['Text'])]
print(df)
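As a side note (not part of the answer above), pandas' built-in factorize produces the same order-of-appearance group numbers in a single line:

df['Group'] = pd.factorize(df['Text'])[0] + 1  # +1 so groups start at 1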
Idea: make a list of the unique values of the column Text, and for the column Group assign each value's index in this unique list. Code example:
df = pd.DataFrame({"Text": ["Background", "Clinical", "Clinical", "Method", "Background"]})
# List of unique values of column `Text`
groups = list(df["Text"].unique())
# Assign each value in `Text` its index
# (you can write `groups.index(text) + 1` when the first value shall be 1)
df["Group"] = df["Text"].map(lambda text: groups.index(text))
# Output for df
print(df)

Result:
Text Group
0 Background 0
1 Clinical 1
2 Clinical 1
3 Method 2
4 Background 0
A solution could be the following:
import pandas as pd
data = pd.DataFrame([["A B", 1], ["A C", 2], ["B A", 3], ["B C", 5]], columns=("name", "value"))
data.groupby(by=[x.split(" ")[0] for x in data.loc[:,"name"]])
You can select the first few words using x.split(" ")[:NUMBER_OF_WORDS]. You then apply the aggregation you want to the resulting GroupBy object, as shown below.
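For instance, a minimal usage sketch with the example data, using sum as an arbitrary aggregation:

# Sum the `value` column per first word of `name`.
data.groupby(by=[x.split(" ")[0] for x in data.loc[:, "name"]])["value"].sum()
# A    3
# B    8
# Name: value, dtype: int64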
I have code which works perfectly and returns the ATR (Average True Range), but it uses an SMA (Simple Moving Average).
How could I apply the same for all of RMA, EMA, WMA?
Here is the code:
def get_ATR(df, window: int = 14, names: tuple = ('OPEN', 'CLOSE', 'LOW', 'HIGH'), return_df: bool = True):
    '''
    Get the Average True Range. Concept of Volatility
    args:
        df: Pandas Data Frame
        window: Rolling window or the period you want to consider
        names: Column names showing ('OPEN','CLOSE','LOW','HIGH') in the same order
        return_df: Whether to return the whole DataFrame or the latest value
    '''
    Open, Close, Low, High = names
    data = df.copy()
    # If the first date entry [0,0] is greater than the next one [1,0], the data is in
    # descending order, so reverse it for the calculation.
    if data.iloc[0, 0] > data.iloc[1, 0]:
        data.sort_index(ascending=False, inplace=True)

    high_low = data[High] - data[Low]
    high_close = np.abs(data[High] - data[Close].shift())
    low_close = np.abs(data[Low] - data[Close].shift())

    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)

    ATR = true_range.rolling(window).sum() / window  # SMA of the true range
    data['ATR'] = ATR

    data.sort_index(ascending=True, inplace=True)
    if return_df:
        return data
    return data.iloc[0, -1]
For RMA (Wilder's smoothing), replace the rolling-sum line with:

ATR = true_range.ewm(alpha=1 / window, min_periods=window, adjust=False).mean()
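For EMA and WMA, here are hedged sketches using the common definitions (they reuse the function's window parameter; verify the exact convention against your data source):

import numpy as np

# EMA: exponentially weighted mean with span equal to the window.
ATR = true_range.ewm(span=window, adjust=False).mean()

# WMA: linearly weighted mean over the rolling window (weights 1..window).
weights = np.arange(1, window + 1)
ATR = true_range.rolling(window).apply(
    lambda x: np.dot(x, weights) / weights.sum(), raw=True
)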
I need to group my dataframe and use several aggregation functions on different columns, and some of these aggregations have conditions.
Here is an example. The data are all the orders from 2 customers, and I would like to calculate some information on each customer, like their order count, their total spending, and their average spending.
import pandas as pd
data = {'order_id' : range(1,9),
'cust_id' : [1]*5 + [2]*3,
'order_amount' : [100,50,70,75,80,105,30,20],
'cust_days_since_reg' : [0,10,25,37,52,0,17,40]}
orders = pd.DataFrame(data)
aggregation = {'order_id' : 'count',
'order_amount' : ['sum', 'mean']}
cust = orders.groupby('cust_id').agg(aggregation).reset_index()
cust.columns = ['_'.join(col) for col in cust.columns.values]
This works fine and gives me a dataframe with one row per customer and the columns cust_id_, order_id_count, order_amount_sum and order_amount_mean.
But I have to add an aggregation function with an argument and a condition: the amount a customer spent in their first X months (X must be customizable).
Since I need an argument in this aggregation, I tried:
def spendings_X_month(group, n_months):
    return group.loc[group['cust_days_since_reg'] <= n_months*30,
                     'order_amount'].sum()

aggregation = {'order_id': 'count',
               'order_amount': ['sum',
                                'mean',
                                lambda x: spendings_X_month(x, 1)]}

cust = orders.groupby('cust_id').agg(aggregation).reset_index()
But that last line gets me the error : KeyError: 'cust_days_since_reg'.
It must be a scoping issue; the cust_days_since_reg column must not be visible in this situation.
I could calculate this last column separately and then join the resulting dataframe to the first, but there must be a better solution that does everything in only one groupby.
Could anyone help me with this problem, please?
Thank you.
You cannot use agg, because each aggregation function works with only one column, so this kind of filtering based on another column is not possible.
Solution: use GroupBy.apply:
def spendings_X_month(group, n_months):
    a = group['order_id'].count()
    b = group['order_amount'].sum()
    c = group['order_amount'].mean()
    d = group.loc[group['cust_days_since_reg'] <= n_months*30,
                  'order_amount'].sum()
    cols = ['order_id_count', 'order_amount_sum', 'order_amount_mean', 'order_amount_spendings']
    return pd.Series([a, b, c, d], index=cols)

cust = orders.groupby('cust_id').apply(spendings_X_month, 1).reset_index()
print(cust)
cust_id order_id_count order_amount_sum order_amount_mean \
0 1 5.0 375.0 75.000000
1 2 3.0 155.0 51.666667
order_amount_spendings
0 220.0
1 135.0