How to count between A and B in the same column of a DataFrame (pandas)?
Count "CUT" in the "m/c code" column between "STD" and "STD",
which are repeated many times in the column.
See the image attached below.
Another solution to your problem would be to create an auxiliary column that labels each interval. Then you can apply a groupby together with the transform method to perform the counting. Here's the code:
from __future__ import annotations
import pandas as pd
import numpy as np
# == Helper functions (Not part of the actual solution) =======================
# You can ignore these functions, as they aren't actually part of the
# solution, but rather a way to generate some data to test the implementation.
def random_dates(
start_date: str | pd.Timestamp,
end_date: str | pd.Timestamp,
size: int = 10,
) -> pd.DatetimeIndex:
"""Generate random dates between two dates.
Parameters
----------
start_date : str | pd.Timestamp
Start date.
end_date : str | pd.Timestamp
End date.
size : int, optional
Number of dates to generate, by default 10.
Returns
-------
pd.DatetimeIndex
Random dates.
Examples
--------
>>> random_dates("2020-01-01", "2020-01-31", size=5) # doctest: +ELLIPSIS
DatetimeIndex(['2020-01-05', '2020-01-12', ...], dtype='datetime64[ns]', freq=None)
"""
start_u = pd.to_datetime(start_date).value // 10**9
end_u = pd.to_datetime(end_date).value // 10**9
return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="s")
def generate_random_frame(
start_date: str | pd.Timestamp,
end_date: str | pd.Timestamp,
size: int = 10,
) -> pd.DataFrame:
"""
Generate a DataFrame to test the solution.
Parameters
----------
start_date : str | pd.Timestamp
Start date. Must be a string representing a date, like "YYYY-MM-DD",
or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp
object.
end_date : str | pd.Timestamp
End date. Must be a string representing a date, like "YYYY-MM-DD",
or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp.
size : int, default 10
Number of rows to generate.
Returns
-------
pd.DataFrame
DataFrame with random dates and random values. The resulting DataFrame
has the following columns:
- "Time": random datetimes between `start_date` and `end_date`.
- "m/c code": random strings from a set of 7 possible values:
"END", "CUT", "STD", "BL1", "ALS", "ST1", or "CLN".
"""
mc_code_choices = ["END", "CUT", "STD", "BL1", "ALS", "ST1", "CLN"]
return pd.DataFrame(
{
"Time": random_dates(start_date, end_date, size),
"m/c code": np.random.choice(mc_code_choices, size),
}
)
# == Solution ==================================================================
def flag_groups_and_count(
df: pd.DataFrame,
group_colname: str = "m/c code",
lowbound_value: str = "END",
upbound_value: str = "STD",
value_to_count: str = "CUT",
count_colname: str = "Count",
flag_colname: str = "FLAG",
) -> pd.DataFrame:
"""
Flag groups and count the number of times a specified value appears in each group.
Groups are defined by values between `lowbound_value` and `upbound_value`
in the column `group_colname`. The flag is set to 1 for the first group.
Subsequent groups are flagged as 2, 3, etc. A flag of 0 means that the row
does not belong to any group.
After flagging the groups, the function counts the number of times
the value specified by the `value_to_count` parameter appears in each group.
Parameters
----------
df : pd.DataFrame
DataFrame to flag.
group_colname : str, default "m/c code"
Column name to group by.
lowbound_value : str, default "END"
Value to start each group.
upbound_value : str, default "STD"
Value to end each group.
value_to_count : str, default "CUT"
Value to count inside the groupby.
count_colname : str, default "Count"
Name of the column to store the counts.
flag_colname : str, default "FLAG"
Name of the column to store each group.
Returns
-------
pd.DataFrame
Original DataFrame with the added flag column.
"""
# Set the initial parameters, used to control the creation of the groups.
current_group = 1 # The current group number.
flag_row = False # Indicates whether the current row should be flagged.
# Create the column that stores the group numbers.
# Set all values initially to 0
df[flag_colname] = 0
# Iterate over each row of the dataframe.
# - index: index of each row. Same values you find by calling df.index
# - row: a pandas Series object with the values of each row.
for index, row in df.iterrows():
# If the current row has a 'm/c code' value equal to 'END',
# then set the flag_row variable to True to indicate that
# the next rows should be set to `current_group` until
# a row with an 'm/c code' value equal to 'STD' is found.
if row[group_colname] == lowbound_value:
flag_row = True
# Does this row belong to a group? If so, set it to `current_group`.
if flag_row:
df.loc[df.index.isin([index]), flag_colname] = current_group
# If the current row has a 'm/c code' value equal to 'STD',
# then we reached the end of a group. Set the flag_row variable
# to False indicating that the next rows should not be flagged to a
# group.
if row[group_colname] == upbound_value:
# Did we reach the end of a group, or simply found another value
# equal to "STD" before the next interval starts?
# This is to avoid incrementing the group number when in fact we didn't
# reach a new interval.
if flag_row:
current_group += 1
flag_row = False
# Groupby 'm/c code' column values by the newly created flag column.
# Inside this groupby, use the `transform` method to count the number of
# times the value "CUT" appears inside each group.
# Store the count in a new column called "Count".
df[count_colname] = df.groupby(flag_colname, as_index=False)[
group_colname
].transform(lambda group: (group == value_to_count).sum())
# Same as:
# df["Count"] = df.groupby("FLAG", as_index=False)[
# "m/c code"
# ].transform(lambda group: (group == "CUT").sum())
# When the flag column is equal to 0, it means that there's no interval.
# Therefore, set such counts to 0. Intervals represent the rows with
# values for the 'm/c code' column between adjacent "END" and "STD" values.
df.loc[df[flag_colname] == 0, count_colname] = 0
# Same as: df.loc[df["FLAG"] == 0, "Count"] = 0
return df
# == Test our code =============================================================
# Parameters to use for generating test DataFrame:
start_date = "2021-07-01 00:00:00"
end_date = "2021-07-03 00:00:00"
# Generate test DataFrame
test_df = generate_random_frame(start_date, end_date, size=30)
# Call the function that defines the implementation to the problem.
test_df = test_df.pipe(flag_groups_and_count)
test_df
Here's a screenshot of the output:
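If you prefer to avoid the explicit iterrows loop, the same labelling idea can be written in a vectorized way. Here's a minimal sketch under the same assumptions (column names and boundary values) as the function above; the function name flag_and_count_vectorized and the vec_df variable are just illustrative:

import numpy as np
import pandas as pd

def flag_and_count_vectorized(
    df: pd.DataFrame,
    group_colname: str = "m/c code",
    lowbound_value: str = "END",
    upbound_value: str = "STD",
    value_to_count: str = "CUT",
) -> pd.DataFrame:
    out = df.copy()
    is_start = out[group_colname].eq(lowbound_value)
    is_end = out[group_colname].eq(upbound_value)
    # State machine without the loop: switch "on" at every lower-bound row and
    # "off" on the row right after each upper-bound row, then forward-fill.
    state = pd.Series(np.nan, index=out.index, dtype=float)
    state[is_end.shift(fill_value=False) & ~is_start] = 0.0
    state[is_start] = 1.0
    in_interval = state.ffill().fillna(0).astype(bool)
    # Number the intervals 1, 2, 3, ... at every off -> on transition.
    new_interval = in_interval & ~in_interval.shift(fill_value=False)
    out["FLAG"] = new_interval.cumsum().where(in_interval, 0)
    # Count the target value inside each interval; rows outside any interval get 0.
    counts = out.groupby("FLAG")[group_colname].transform(
        lambda g: g.eq(value_to_count).sum()
    )
    out["Count"] = counts.where(out["FLAG"] > 0, 0)
    return out

# Example usage, reusing the random frame generator from above:
vec_df = generate_random_frame("2021-07-01", "2021-07-03", size=30).pipe(flag_and_count_vectorized)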
There are obviously many different ways.
One solution is this:
import pandas as pd
def c_counter(df, A, B, C):
    # Use the A, B, C parameters instead of the hard-coded strings 'A', 'B', 'C'.
    starts = [i for i, x in enumerate(df['m/c code']) if x == A]
    ends = [i for i, x in enumerate(df['m/c code']) if x == B]
    df['Count'] = ''
    for start, end in zip(starts, ends):
        # .loc avoids chained-assignment warnings; end - 1 keeps the slice
        # exclusive of the closing row, as in the output below.
        df.loc[start:end - 1, 'Count'] = (df['m/c code'][start:end] == C).sum()
    return df
df = pd.DataFrame({'m/c code': ['A', 'X', 'C', 'X', 'C', 'X', 'B', 'X', 'A', 'C', 'X', 'B']})
A = 'A'
B = 'B'
C = 'C'
c_counter(df, A, B, C)
Out:
m/c code Count
0 A 2
1 X 2
2 C 2
3 X 2
4 C 2
5 X 2
6 B
7 X
8 A 1
9 C 1
10 X 1
11 B
Next time, please make sure to include sample code.
I'm trying to apply a function to each segment/partition that is the result of a groupby operation.
def get_click_rate(data):
click_count = data[data['event'] == 'click'].shape[0]
view_count = data[data['event'] == 'pageview'].shape[0]
return click_count / view_count
data.groupby('linkid').apply(get_click_rate).reset_index(name='click rate')
The idea here is that I'm grouping the dataframe by the LinkID of a webpage, and then passing each partition to a function that filters the sub-dataframe, computes a number, and returns it. However, it returns wrong figures. Here is the snippet that returns the right figures:
click_event = data[data['event'] == 'click'].groupby('linkid')['event'].count().reset_index(name='click count')
view_event = data[data['event'] == 'pageview'].groupby('linkid')['event'].count().reset_index(name='view count')
merged_df = pd.merge(left=click_event, right=view_event, on='linkid', how='inner')
merged_df['click rate'] = merged_df['click count'] / merged_df['view count']
To my eyes at least, they are doing the same thing but in a different order: the second snippet filters the data first, groups it, and then merges the sub-dataframes to reach the desired figures.
Can someone help shed some light on what I'm missing here?
I've tried your get_click_rate function here and it seems to be returning the same result as the second approach you've written. The only problem I've encountered with get_click_rate is when you try to compute the click rate for a linkid group that has no 'pageview' event. Because of that, I've made some minor changes to your get_click_rate function:
import pandas as pd
def get_click_rate(data: pd.DataFrame) -> pd.Series:
"""Calculate the click rate for a given ``linkid`` group.
Parameters
----------
data : pd.DataFrame
A dataframe representing values from a given ``linkid`` group,
containing the column 'event'.
Returns
-------
pd.Series
A series containing the click count, view count, and click rate.
The click rate is calculated as the ratio of the click count to
the view count. If the view count is zero, the click rate is set to zero.
Examples
--------
>>> df = pd.DataFrame(
...     {'event': ['click', 'pageview', 'click', 'some_other_value'],
...      'linkid': [1, 1, 2, 2]}
... )
>>> df.groupby('linkid').apply(get_click_rate).reset_index()
linkid click count view count click_rate
0 1 1.0 1.0 1.0
1 2 1.0 0.0 0.0
Notes
-----
This function returns a pandas Series regardless of whether the
``linkid`` group contains any view count or not. Therefore, if you want
only the ``linkid``s that have a click rate, you can use the following code:
.. code-block:: python
(
df.groupby('linkid')
.apply(get_click_rate)
.reset_index()
.loc[lambda xdf: xdf['click_rate'] > 0, :]
)
"""
click_count: int = data[data['event'] == 'click'].shape[0]
view_count: int = data[data['event'] == 'pageview'].shape[0]
click_rate: float = 0
# Only compute the click rate when `view_count` is greater than zero.
if view_count > 0:
click_rate = round(click_count / view_count, 2)
return pd.Series({'click count': click_count,
'view count': view_count,
'click_rate': click_rate})
Testing get_click_rate Function
import pandas as pd
import numpy as np
# get_click_rate: same function as defined above.
event_choices = ['click', 'pageview', 'some_other_value']
linkid_choices = ['1', '2', '3']
nrows = 30
# -- Generating a Dummy DataFrame for Testing ------------------------------
df = pd.concat(
[
pd.DataFrame(
{
'event': np.random.choice(event_choices, nrows),
'linkid': np.random.choice(linkid_choices, nrows),
}
),
pd.DataFrame({'event': ['some_other_value'] * 3, 'linkid': '4'})
], ignore_index=True
)
(
# Group dataframe by column `linkid`
df.groupby('linkid')
# Apply function `get_click_rate` that returns a pandas Series with three
# columns ('click count', 'view count' and 'click_rate') for every 'linkid' value.
.apply(get_click_rate)
.reset_index()
# Convert the data type of 'click count', and 'view count' column to integers
.astype({'click count': int, 'view count': int})
# Filter for `linkid`s that have a click rate greater than zero.
.loc[lambda xdf: xdf['click_rate'] > 0, :]
)
# Returns:
#
# linkid click count view count click_rate
# 0 1 2 3 0.67
# 1 2 3 3 1.00
# 2 3 3 6 0.50
I have a dataset with a column of type StringType(). The values in these money columns contain abbreviations like K and M.
I would like to remove the 'K' and 'M' and multiply the values by 1,000 or 1,000,000 for K/M, respectively. I tried creating a function and using it to add a new column to the dataframe, but I keep getting the following error:
ValueError: Cannot convert column into bool: please use '&' for 'and',
'|' for 'or', '~' for 'not' when building DataFrame boolean
expressions.
The Column values are as follows:
def convertall(ReleaseClause):
if ReleaseClause == None:
return 0
elif expr("substring(ReleaseClause,-1,length(ReleaseClause))")=='K':
remove_euro=expr("substring(ReleaseClause,2,length(ReleaseClause))")
remove_K=translate(remove_euro,'K','')
remove_Kint=remove_K.cast(IntegerType())*lit(1000)
return remove_Kint
elif expr("substring(ReleaseClause,-1,length(ReleaseClause))")=='M':
remove_euro=expr("substring(ReleaseClause,2,length(ReleaseClause))")
remove_M=translate(remove_euro,'M','')
remove_Mint=remove_M.cast(IntegerType())*lit(1000000)
return remove_Mint
else:
return ReleaseClause
The following code converts the data using the F.when() function. Before that, the string is split into letters, then the M/K symbol is extracted, as well as the amount to be multiplied. This solution assumes the string length stays the same and that the positions of the M/K symbol and the amount do not vary across rows.
import pyspark.sql.functions as F
data = [("$12.3M",),
("$23.4K",),
("$12.5M",),
("$22.3K",)]
df = spark.createDataFrame(data, schema=["ReleaseClause"])
df_ans = (df
.select("ReleaseClause",
(F.split("ReleaseClause",'').alias("split")))
.withColumn("scale", F.col("split")[5])
.withColumn("amount",
F.concat(F.col("split")[1], F.col("split")[2],
F.col("split")[3], F.col("split")[4])
.cast("double"))
.withColumn("scaled", F.when(F.col("scale")=="K",
F.col("amount")*1000)
.when(F.col("scale")=="M",
F.col("amount")*1000000))
This produces the following output:
You could check if value contains K or M, extract the number and multiply.
Example:
data = [("$12.3M",), ("$23.4K",), ("$12.5M",), ("$22.3K",)]
df = spark.createDataFrame(data, schema=["ReleaseClause"])
df = df.withColumn(
"result",
F.when(
F.col("ReleaseClause").contains("K"),
F.regexp_extract(F.col("ReleaseClause"), r"(\d+(\.\d+)?)", 1).cast(DoubleType())
* 1000,
)
.when(
F.col("ReleaseClause").contains("M"),
F.regexp_extract(F.col("ReleaseClause"), r"(\d+(\.\d+)?)", 1).cast(DoubleType())
* 1000000,
)
.cast(IntegerType()),
)
Result:
+-------------+--------+
|ReleaseClause|result |
+-------------+--------+
|$12.3M |12300000|
|$23.4K |23400 |
|$12.5M |12500000|
|$22.3K |22300 |
+-------------+--------+
Consider this simple pandas DataFrame with columns 'record', 'start', and 'param'. There can be multiple rows with the same record value, and each unique record value corresponds to the same start value. However, the 'param' value can be different for the same 'record' and 'start' combination:
pd.DataFrame({'record':[1,2,3,4,4,5,6,7,7,7,8], 'start':[0,5,7,13,13,19,27,38,38,38,54], 'param':['t','t','t','u','v','t','t','t','u','v','t']})
I'd like to make a column 'end' that takes the value of 'start' in the row with the next unique value of 'record'. The values of column 'end' should be:
[5,7,13,19,19,27,38,54,54,54,NaN]
I'm able to do this using a for loop, but I know this is not preferred when using pandas:
max_end = 100
for idx, row in df.iterrows():
try:
n = 1
next_row = df.iloc[idx+n]
while next_row['start'] == row['start']:
n = n+1
next_row = df.iloc[idx+n]
end = next_row['start']
except:
end = max_end
df.at[idx, 'end'] = end
Is there an easy way to achieve this without a for loop?
I have no doubt there is a smarter solution but here is mine.
df1['end'] = df1.drop_duplicates(subset = ['record', 'start'])['start'].shift(-1).reindex(index = df1.index, method = 'ffill')
-=EDIT=-
Added subset into drop_duplicates to account for question amendment
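For reference, here is the same one-liner spelled out step by step on the sample frame from the question (just a sketch; df is the question's DataFrame rather than df1):

import pandas as pd

df = pd.DataFrame({
    'record': [1, 2, 3, 4, 4, 5, 6, 7, 7, 7, 8],
    'start':  [0, 5, 7, 13, 13, 19, 27, 38, 38, 38, 54],
    'param':  ['t', 't', 't', 'u', 'v', 't', 't', 't', 'u', 'v', 't'],
})

# Keep one row per (record, start) pair and look one unique row ahead.
next_start = df.drop_duplicates(subset=['record', 'start'])['start'].shift(-1)
# Spread those values back onto the original (duplicated) index.
df['end'] = next_start.reindex(index=df.index, method='ffill')
# df['end'] is now [5, 7, 13, 19, 19, 27, 38, 54, 54, 54, NaN]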
This solution is equivalent to @Quixotic22's, although more explicit.
df = pd.DataFrame({
'record':[1,2,3,4,4,5,6,7,7,7,8],
'start':[0,5,7,13,13,19,27,38,38,38,54],
'param':['t','t','t','u','v','t','t','t','u','v','t']
})
max_end = 100
df["end"] = None # create new column with empty values
loc = df["record"].shift(1) != df["record"] # record where the next value is diff from previous
df.loc[loc, "end"] = df.loc[loc, "start"].shift(-1) # assign desired values
df["end"].fillna(method = "ffill", inplace = True) # fill remaining missing values
df.loc[df.index[-1], "end"] = max_end # override last value
df
I have a dataframe:
Text
Background
Clinical
Method
Direct
Background
Direct
Now I want to group them in a new column according to their first words, e.g. Background belongs to group 1, Clinical belongs to group 2, and so on.
The expected output is a dataframe:
Text Group
Background 1
Clinical 2
Method 3
Direct 4
Background 1
Direct 4
Try this:
import pandas as pd
text = ['Background', 'Clinical', 'Method', 'Direct', 'Background', 'Direct']
df = pd.DataFrame(text, columns=['Text'])
def create_idx_map():
idx = 1
values = {}
for item in list(df['Text']):
if item not in values:
values[item] = idx
idx += 1
return values
values = create_idx_map()
df['Group'] = [values[x] for x in list(df['Text'])]
print(df)
Idea: make a list of the unique values of the column Text, and for the column Group assign each value's index in this unique list. Code example:
df = pd.DataFrame({"Text": ["Background", "Clinical", "Clinical", "Method", "Background"]})
# List of unique values of column `Text`
groups = list(df["Text"].unique())
# Assign each value in `Text` its index
# (you can write `groups.index(text) + 1` if the first group should be 1)
df["Group"] = df["Text"].map(lambda text: groups.index(text))
# Output for df
print(df)
# Result:
Text Group
0 Background 0
1 Clinical 1
2 Clinical 1
3 Method 2
4 Background 0
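If you do not need the mapping list itself, pandas' factorize does the same labelling in one call. A short alternative sketch (with + 1 so the groups start at 1, matching the expected output in the question):

import pandas as pd

df = pd.DataFrame({'Text': ['Background', 'Clinical', 'Method', 'Direct', 'Background', 'Direct']})
# factorize assigns 0, 1, 2, ... in order of first appearance.
df['Group'] = pd.factorize(df['Text'])[0] + 1
print(df)
#          Text  Group
# 0  Background      1
# 1    Clinical      2
# 2      Method      3
# 3      Direct      4
# 4  Background      1
# 5      Direct      4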
A solution could be the following:
import pandas as pd
data = pd.DataFrame([["A B", 1], ["A C", 2], ["B A", 3], ["B C", 5]], columns=("name", "value"))
data.groupby(by=[x.split(" ")[0] for x in data.loc[:,"name"]])
You can select the first few words using x.split(" ")[:NUMBER_OF_WORDS]. You then apply the aggregation you want to the resulting groupby object, as shown below.
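For example, a small sketch that sums the value column per first word (the sum aggregation is just an assumption here; pick whatever fits your data):

import pandas as pd

data = pd.DataFrame([["A B", 1], ["A C", 2], ["B A", 3], ["B C", 5]], columns=("name", "value"))
# Group by the first word of "name", then aggregate the numeric column.
grouped = data.groupby(by=[x.split(" ")[0] for x in data.loc[:, "name"]])
print(grouped["value"].sum())
# A    3
# B    8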
I'm trying to add rows and columns to a pandas DataFrame incrementally. I have a lot of data stored across multiple datastores and a heuristic to determine a value. As I navigate across this datastore, I'd like to be able to incrementally update a dataframe, where in some cases either names or days will be missing.
import random
import pandas as pd

def foo():
df = pd.DataFrame()
year = 2016
names = ['Bill', 'Bob', 'Ryan']
for day in range(1, 4, 1):
for name in names:
if random.choice([True, False]): # sometimes a name will be missing
continue
value = random.randrange(0, 20, 1) # random value from heuristic
col = '{}_{}'.format(year, day) # column name
df = df.append({col: value, 'name': name}, ignore_index=True)
df.set_index('name', inplace=True, drop=True)
print(df.loc['Bill'])
This produces the following results:
2016_1 2016_2 2016_3
name
Bill 15.0 NaN NaN
Bill NaN 12.0 NaN
I've created a heatmap of the data and it's blocky due to duplicate names, so the output I'm looking for is:
2016_1 2016_2 2016_3
name
Bill 15.0 12.0 NaN
How can I combine these rows?
Is there a more efficient means of creating this dataframe?
Try this:
df.groupby('name')[df.columns.values].sum()
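A small sketch of what this does on the duplicated rows from the question. Note that by default sum turns an all-NaN group into 0.0; passing min_count=1 keeps the NaN shown in the expected output (whether you want that is an assumption on my part):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {'2016_1': [15.0, np.nan], '2016_2': [np.nan, 12.0], '2016_3': [np.nan, np.nan]},
    index=pd.Index(['Bill', 'Bill'], name='name'),
)
# Collapse the duplicate 'Bill' rows into one; min_count=1 keeps NaN
# where every value in the group is missing instead of returning 0.0.
print(df.groupby('name').sum(min_count=1))
#        2016_1  2016_2  2016_3
# name
# Bill     15.0    12.0     NaN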
try this:
df.pivot_table(index='name', aggfunc='sum', dropna=False)
After you run your foo() function, you can use any aggregation function (if you have only one value per column and all the others are null) and groupby on df.
First, use reset_index to get back your name column.
Then use groupby and apply. Here I propose a custom function which checks that there is only one value per column, and raises a ValueError if not.
df.reset_index(inplace=True)
def aggdata(x):
if all([i <= 1 for i in x.count()]):
return x.mean()
else:
raise ValueError
ddf = df.groupby('name').apply(aggdata)
If all the values of the column are null but one, x.mean() will return that value (in fact, you can use almost any aggregator: since there is only one value, that is the one returned).
It would be easier to have the names as columns and the date as the index instead. Plus, you can work with lists inside the loop and create the pd.DataFrame afterwards.
e.g.
import random
import numpy as np
import pandas as pd

year = 2016
names = ['Bill', 'Bob', 'Ryan']
index = []
valueBill = []
valueBob = []
valueRyan = []
for day in range(1, 4):
if random.choice([True, False]): # sometimes a name will be missing
valueBill.append(random.randrange(0, 20))
valueBob.append(random.randrange(0, 90))
valueRyan.append(random.randrange(0, 200))
index.append('{}-0{}'.format(year, day)) # column name
else:
valueBill.append(np.nan)
valueBob.append(np.nan)
valueRyan.append(np.nan)
index.append(np.nan)
df = pd.DataFrame({})
for name, value in zip(names,[valueBill,valueBob,valueRyan]):
df[name] = value
df.set_index(pd.to_datetime(index))
You can append the entries whose names do not already exist, and then do an update on the existing entries.
import pandas as pd
import random
def foo():
df = pd.DataFrame()
year = 2016
names = ['Bill', 'Bob', 'Ryan']
for day in range(1, 4, 1):
for name in names:
if random.choice([True, False]): # sometimes a name will be missing
continue
value = random.randrange(0, 20, 1) # random value from heuristic
col = '{}_{}'.format(year, day) # column name
new_df = pd.DataFrame({col: value, 'name':name}, index=[1]).set_index('name')
df = pd.concat([df,new_df[~new_df.index.isin(df.index)].dropna()])
df.update(new_df)
#df.set_index('name', inplace=True, drop=True)
print(df)