Python-polars: expanding window groupby

I'd like to calculate aggregated metrics with an expanding window. Basically, given the following dataframe:
from datetime import date
import polars as pl
df = pl.DataFrame({"Day":[date(2022, 1, i) for i in range(1,10)], "value":[1,2,3,4,5,6,7,8,9]})
shape: (9, 2)
┌────────────┬───────┐
│ Day ┆ value │
│ --- ┆ --- │
│ date ┆ i64 │
╞════════════╪═══════╡
│ 2022-01-01 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-02 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-03 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-04 ┆ 4 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-06 ┆ 6 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-07 ┆ 7 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-08 ┆ 8 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-09 ┆ 9 │
└────────────┴───────┘
What I'm after is basically this:
|--|
|-----|
|--------|
I tried to use groupby_rolling and groupby_dynamic, but I couldn't get it to fix the initial time of each group to the first timestamp. My current workaround is something like this:
date_range = pl.date_range(df.select("Day").min().row(0)[0], df.select("Day").max().row(0)[0], "1w")
for timestamp in date_range:
    print(df.filter(pl.col("Day").is_between(date_range[0], timestamp, include_bounds=True)))
shape: (1, 2)
┌────────────┬───────┐
│ Day ┆ value │
│ --- ┆ --- │
│ date ┆ i64 │
╞════════════╪═══════╡
│ 2022-01-01 ┆ 1 │
└────────────┴───────┘
shape: (8, 2)
┌────────────┬───────┐
│ Day ┆ value │
│ --- ┆ --- │
│ date ┆ i64 │
╞════════════╪═══════╡
│ 2022-01-01 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-02 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-03 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-04 ┆ 4 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-05 ┆ 5 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-06 ┆ 6 │
...
│ 2022-01-07 ┆ 7 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-08 ┆ 8 │
└────────────┴───────┘
This gives me the exact aggregation I'm after, but I feel like there's a much more efficient way of doing this - and I'd especially like to do my aggregations within a groupby context.

Not sure if it's possible with dynamic/rolling, but you could create a dataframe from your date range and do a cross join.
>>> start = df.get_column("Day").min()
... end = df.get_column("Day").max()
... date_range = (
...     pl.date_range(start, end, interval="1w").to_frame("end")
...     .with_row_count(name="group")
... )
>>> date_range
shape: (2, 2)
┌───────┬────────────┐
│ group ┆ end │
│ --- ┆ --- │
│ u32 ┆ date │
╞═══════╪════════════╡
│ 0 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2022-01-08 │
└───────┴────────────┘
You can then run your filter and be left with a group identifier:
>>> (
...     df
...     .join(date_range, how="cross")
...     .with_column(pl.lit(start).alias("start"))
...     .filter(
...         pl.col("Day").is_between(
...             pl.col("start"),
...             pl.col("end"),
...             include_bounds=True))
...     .drop(["start", "end"])
... )
shape: (9, 3)
┌────────────┬───────┬───────┐
│ Day ┆ value ┆ group │
│ --- ┆ --- ┆ --- │
│ date ┆ i64 ┆ u32 │
╞════════════╪═══════╪═══════╡
│ 2022-01-01 ┆ 1 ┆ 0 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-01 ┆ 1 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-02 ┆ 2 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-03 ┆ 3 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-05 ┆ 5 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-06 ┆ 6 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-07 ┆ 7 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-08 ┆ 8 ┆ 1 │
└────────────┴───────┴───────┘
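From there, each expanding window is identified by group, so the aggregation the question asks for is a plain groupby. A minimal sketch, reusing the frames above and assuming a sum as the example metric:
>>> (
...     df
...     .join(date_range, how="cross")
...     .with_column(pl.lit(start).alias("start"))
...     .filter(
...         pl.col("Day").is_between(
...             pl.col("start"),
...             pl.col("end"),
...             include_bounds=True))
...     .groupby("group")
...     .agg(pl.col("value").sum().alias("value_sum"))
... )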

Related

How to add a duration to datetime in Python polars

I want to add a duration in seconds to a date/time. My data looks like
import polars as pl
df = pl.DataFrame(
    {
        "dt": [
            "2022-12-14T00:00:00", "2022-12-14T00:00:00", "2022-12-14T00:00:00",
        ],
        "seconds": [
            1.0, 2.2, 2.4,
        ],
    }
)
df = df.with_column(pl.col("dt").str.strptime(pl.Datetime).cast(pl.Datetime))
Now my naive attempt was to convert the float column to duration type to be able to add it to the datetime column (as I would do in pandas).
df = df.with_column(pl.col("seconds").cast(pl.Duration).alias("duration0"))
print(df.head())
shape: (3, 3)
┌─────────────────────┬─────────┬──────────────┐
│ dt ┆ seconds ┆ duration0 │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ f64 ┆ duration[μs] │
╞═════════════════════╪═════════╪══════════════╡
│ 2022-12-14 00:00:00 ┆ 1.0 ┆ 0µs │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 00:00:00 ┆ 2.2 ┆ 0µs │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 00:00:00 ┆ 2.4 ┆ 0µs │
└─────────────────────┴─────────┴──────────────┘
...gives the correct data type, however the values are all zero.
I also tried
df = df.with_column(
    pl.col("seconds")
    .apply(lambda x: pl.duration(nanoseconds=x * 1e9))
    .alias("duration1")
)
print(df.head())
shape: (3, 4)
┌─────────────────────┬─────────┬──────────────┬─────────────────────────────────────┐
│ dt ┆ seconds ┆ duration0 ┆ duration1 │
│ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ f64 ┆ duration[μs] ┆ object │
╞═════════════════════╪═════════╪══════════════╪═════════════════════════════════════╡
│ 2022-12-14 00:00:00 ┆ 1.0 ┆ 0µs ┆ 0i64.duration([0i64, 1000000000f... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 00:00:00 ┆ 2.2 ┆ 0µs ┆ 0i64.duration([0i64, 2200000000f... │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 00:00:00 ┆ 2.4 ┆ 0µs ┆ 0i64.duration([0i64, 2400000000f... │
└─────────────────────┴─────────┴──────────────┴─────────────────────────────────────┘
which gives an object-type column, which isn't helpful either. The documentation is kind of sparse on the topic; any better options?
Update: The values being zero is a repr formatting issue that has been fixed with this commit.
pl.duration() can be used in this way:
>>> df.with_column(
... pl.col("dt").str.strptime(pl.Datetime)
... + pl.duration(nanoseconds=pl.col("seconds") * 1e9)
... )
shape: (3, 2)
┌─────────────────────────┬─────────┐
│ dt                      ┆ seconds │
│ ---                     ┆ ---     │
│ datetime[μs]            ┆ f64     │
╞═════════════════════════╪═════════╡
│ 2022-12-14 00:00:01     ┆ 1.0     │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 00:00:02.200 ┆ 2.2     │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2022-12-14 00:00:02.400 ┆ 2.4     │
└─────────────────────────┴─────────┘
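A variant of the same idea (a sketch, not from the original answer): pl.duration also accepts a milliseconds argument, which avoids the 1e9 factor for fractional seconds:
>>> df.with_column(
...     pl.col("dt").str.strptime(pl.Datetime)
...     + pl.duration(milliseconds=pl.col("seconds") * 1000)  # 2.2 s -> 2200 ms
... )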

How to add a new field with the counts per group criteria in python polars?

I have a small use case and here is a polars dataframe.
df_names = pl.DataFrame({
    'LN': ['Mallesham','Bhavik','Mallesham','Bhavik','Mahesh','Naresh','Sharath','Rakesh','Mallesham'],
    'FN': ['Yamulla','Yamulla','Yamulla','Yamulla','Dayala','Burre','Velmala','Uppu','Yamulla'],
    'SSN': ['123','456','123','456','893','111','222','333','123'],
    'Address': ['A','B','C','D','E','F','G','H','S']})
Here I would like to group on LN, FN and SSN, and create a new column holding the number of observations for each group combination, as in the expected output below.
'Mallesham','Yamulla','123' appears 3 times, hence the LN_FN_SSN_count field is filled with 3.
You can use an expression using over (which is like grouping, aggregating and self-joining in other libs, but without the need for the join):
df_names.with_column(pl.count().over(['LN', 'FN', 'SSN']).alias('LN_FN_SSN_count'))
shape: (9, 5)
┌───────────┬─────────┬─────┬─────────┬─────────────────┐
│ LN ┆ FN ┆ SSN ┆ Address ┆ LN_FN_SSN_count │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ u32 │
╞═══════════╪═════════╪═════╪═════════╪═════════════════╡
│ Mallesham ┆ Yamulla ┆ 123 ┆ A ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Bhavik ┆ Yamulla ┆ 456 ┆ B ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Mallesham ┆ Yamulla ┆ 123 ┆ C ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Bhavik ┆ Yamulla ┆ 456 ┆ D ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Naresh ┆ Burre ┆ 111 ┆ F ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Sharath ┆ Velmala ┆ 222 ┆ G ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Rakesh ┆ Uppu ┆ 333 ┆ H ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Mallesham ┆ Yamulla ┆ 123 ┆ S ┆ 3 │
└───────────┴─────────┴─────┴─────────┴─────────────────┘
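For comparison, here is a minimal sketch (an illustration, not part of the original answer) of the groupby-aggregate-join pattern that over replaces:
counts = df_names.groupby(["LN", "FN", "SSN"]).agg(
    pl.count().alias("LN_FN_SSN_count")
)
# Same result as the window expression, but with an explicit self-join:
df_names.join(counts, on=["LN", "FN", "SSN"], how="left")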

Repeating a date in polars and exploding it

I have a polars dataframe with two date columns that represent a start and an end date, plus a value that I want to repeat for all dates between them, so that I can join it with other tables.
Example input is
id   start       end         value
123  2022-01-01  2022-01-04  10
abc  2022-03-04  2022-03-04  3
456  2022-05-11  2022-05-16  4

and expected output is

id   date        value
123  2022-01-01  10
123  2022-01-02  10
123  2022-01-03  10
123  2022-01-04  10
abc  2022-03-04  3
456  2022-05-11  4
456  2022-05-12  4
456  2022-05-13  4
456  2022-05-14  4
456  2022-05-15  4
456  2022-05-16  4
I struggled with the same problem today and thought I'd share my solution.
As cbilot already mentions, pl.date_range doesn't take expressions as its low and high values, so I worked around that by using apply.
Data:
import polars as pl
from datetime import date
df = pl.DataFrame(
    {
        "id": ["123", "abc", "456"],
        "start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
        "end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
        "value": [10, 3, 4],
    }
)
Solution:
(
    df.with_columns(
        [(pl.struct(["start", "end"])
          .apply(lambda x: pl.date_range(x["start"], x["end"], "1d"))
          .alias("date"))])
    .explode(pl.col("date"))
    .select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
Starting with this data:
import polars as pl
from datetime import date
df = pl.DataFrame(
    {
        "id": ["123", "abc", "456"],
        "start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
        "end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
        "value": [10, 3, 4],
    }
)
df
shape: (3, 4)
┌─────┬────────────┬────────────┬───────┐
│ id ┆ start ┆ end ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 │
╞═════╪════════════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴────────────┴───────┘
The Algorithm
(
    df.with_columns(
        [pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
    .explode("date")
    .with_column(pl.col("date").cast(pl.Date))
    .select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
In steps
Normally, we would create a range of dates using the date_range expression. However, date_range does not take an Expression as its low and high parameters.
arange, on the other hand, does allow Expressions as its low and high parameters. We can (implicitly) cast the start and end dates to integers, which represent the number of days since the UNIX epoch.
The result is a list of integers representing the days between (and including) the start and end dates, expressed as days since the UNIX epoch.
Notice that we have to add 1 to the high parameter to make sure we capture the end date.
(
    df.with_columns(
        [pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
)
shape: (3, 5)
┌─────┬────────────┬────────────┬───────┬───────────────────────────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ list[i64] │
╞═════╪════════════╪════════════╪═══════╪═══════════════════════════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ [18993, 18994, ... 18996] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ [19055] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ [19123, 19124, ... 19128] │
└─────┴────────────┴────────────┴───────┴───────────────────────────┘
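As a quick sanity check, 18993 is indeed the number of days from the UNIX epoch to the first start date:
from datetime import date
# Days from 1970-01-01 to 2022-01-01: 52 years * 365 + 13 leap days = 18993
print((date(2022, 1, 1) - date(1970, 1, 1)).days)  # 18993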
Next we can use explode to place each of the integers on a separate row.
(
    df.with_columns(
        [pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
    .explode("date")
)
shape: (11, 5)
┌─────┬────────────┬────────────┬───────┬───────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ i64 │
╞═════╪════════════╪════════════╪═══════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18993 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18994 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18995 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18996 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ 19055 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19123 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19124 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19125 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19126 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19127 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19128 │
└─────┴────────────┴────────────┴───────┴───────┘
The final step is to cast the date column back to a pl.Date, and then select only the columns that we want.
(
    df.with_columns(
        [pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")])
    .explode("date")
    .with_column(pl.col("date").cast(pl.Date))
    .select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘

How to calculate pct_change with polars?

Now I have a dataframe like this:
import pandas as pd
df = pd.DataFrame({"asset": ["a","b","c","a","b","c","b","c"], "v": [1,2,3,4,5,6,7,8], "date": ["2017","2011","2012","2013","2014","2015","2016","2010"]})
I can calculate the pct_change by groupby and my function like this:
def fun(df):
    df = df.sort_values(by="date")
    df["pct_change"] = df["v"].pct_change()
    return df

df = df.groupby("asset", as_index=False).apply(fun)
Now I want to know how I can get the same result with polars?
Here are two options: one using window functions, and one using groupby + explode.
You should benchmark and see which is faster on your use case.
preparing data
df = pl.DataFrame({
    "asset": ["a","b","c","a","b","c","b","c"],
    "v": [1,2,3,4,5,6,7,8],
    "date": ["2017","2011","2012","2013","2014","2015","2016","2010"]
})
using window functions
(
    df.sort(["asset", "date"])
    .with_columns([
        pl.col("v").pct_change().over("asset").alias("pct_change")
    ])
)
using groupby + explode
(df.groupby("asset")
    .agg([
        pl.all().sort_by("date"),
        pl.col("v").sort_by("date").pct_change().alias("pct_change")
    ])
    .explode(["v", "date", "pct_change"])
)
Result
Both output:
shape: (8, 4)
┌───────┬─────┬──────┬────────────┐
│ asset ┆ v ┆ date ┆ pct_change │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str ┆ f64 │
╞═══════╪═════╪══════╪════════════╡
│ a ┆ 4 ┆ 2013 ┆ null │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 1 ┆ 2017 ┆ -0.75 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 2011 ┆ null │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 5 ┆ 2014 ┆ 1.5 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 7 ┆ 2016 ┆ 0.4 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 8 ┆ 2010 ┆ null │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 2012 ┆ -0.625 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 6 ┆ 2015 ┆ 1.0 │
└───────┴─────┴──────┴────────────┘
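For reference, pct_change computes (v_t - v_{t-1}) / v_{t-1} within each group after sorting by date. A quick hand check against the table above:
assert (1 - 4) / 4 == -0.75  # asset "a": 2013 -> 2017
assert (5 - 2) / 2 == 1.5    # asset "b": 2011 -> 2014
assert (7 - 5) / 5 == 0.4    # asset "b": 2014 -> 2016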

How to assign column values based on another column iteratively with Polars

For these two dfs, I want to check, for each i in df1["TS"], whether df["TS"] == df1["TS"]; if so, assign the value in "Dr" that corresponds to i to the "mmsi" column:
df = pl.DataFrame({"TS": [1, 2, 3, 4, 5, 6, 7], "mmsi": [11, 12, 13, 14, 15, 16, 17]})
df1 = pl.DataFrame({
    "TS": [4, 6, 7],
    "Dr": [21, 22, 23]})
I want the output of df["mmsi"] to be: [11,12,13,21,15,22,23]
I suggest using a "left" join, followed by a fill_null, to fill in values of Dr that are not found.
df.join(
    df1,
    on="TS",
    how="left"
).with_column(pl.col('Dr').fill_null(pl.col('mmsi')))
shape: (7, 3)
┌─────┬──────┬─────┐
│ TS ┆ mmsi ┆ Dr │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪══════╪═════╡
│ 1 ┆ 11 ┆ 11 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 12 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 13 ┆ 13 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
│ 4 ┆ 14 ┆ 21 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
│ 5 ┆ 15 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
│ 6 ┆ 16 ┆ 22 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
│ 7 ┆ 17 ┆ 23 │
└─────┴──────┴─────┘
Your result is in the Dr column. If needed, you can drop/rename columns so that mmsi is the final column.
df = (
    df.join(df1, on="TS", how="left")
    .with_column(pl.col("Dr").fill_null(pl.col("mmsi")))
    .drop("mmsi")
    .rename({"Dr": "mmsi"})
)
print(df)
shape: (7, 2)
┌─────┬──────┐
│ TS ┆ mmsi │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪══════╡
│ 1 ┆ 11 │
├╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ 13 │
├╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 4 ┆ 21 │
├╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 5 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 6 ┆ 22 │
├╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 7 ┆ 23 │
└─────┴──────┘
Taken in steps, the "left" join will yield the following.
df.join(
    df1,
    on="TS",
    how="left"
)
shape: (7, 3)
┌─────┬──────┬──────┐
│ TS ┆ mmsi ┆ Dr │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪══════╪══════╡
│ 1 ┆ 11 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ 12 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ 13 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 4 ┆ 14 ┆ 21 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 5 ┆ 15 ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 6 ┆ 16 ┆ 22 │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 7 ┆ 17 ┆ 23 │
└─────┴──────┴──────┘
The fill_null step will then fill in any missing values in the Dr column using the corresponding values in the mmsi column.
The performance of this will be much better than iterating over values using a for loop.
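An equivalent formulation (a sketch, not part of the original answer) expresses the same fill with when/then/otherwise:
df.join(df1, on="TS", how="left").with_column(
    pl.when(pl.col("Dr").is_null())
    .then(pl.col("mmsi"))
    .otherwise(pl.col("Dr"))
    .alias("mmsi")  # overwrite mmsi directly instead of fill_null + rename
).drop("Dr")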
