I have two polars DataFrames: one, df_1, with start and end columns, and another, df_2, with a dates column. I want to do a left join on df_2 under the condition that dates falls between start and end.
To make it more obvious what I want to do, here is an example.
DATA
import polars as pl
from datetime import date
df_1 = pl.DataFrame(
    {
        "id": ["abc", "abc", "456"],
        "start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
        "end": [date(2022, 2, 4), date(2022, 3, 10), date(2022, 5, 16)],
        "value": [10, 3, 4],
    }
)
df_2 = pl.DataFrame(
    {
        "id": ["abc", "abc", "456", "abc", "abc", "456"],
        "dates": [
            date(2022, 1, 2),
            date(2022, 3, 4),
            date(2022, 5, 11),
            date(2022, 1, 4),
            date(2022, 3, 7),
            date(2022, 5, 13),
        ],
    }
)
So now I would join on id, with the condition that dates lies between start and end, and the result should look like this:
RESULT
shape: (6, 3)
┌─────┬────────────┬───────┐
│ id ┆ dates ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ abc ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-07 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
└─────┴────────────┴───────┘
(I'm going to assume that your intervals in df_1 do not overlap for a particular id - otherwise, there may not be a unique value that we can assign to the id/dates combinations in df_2.)
One way to do this is with join_asof.
The Algorithm
(
    df_2
    .sort("dates")
    .join_asof(
        df_1.sort("start"),
        by="id",
        left_on="dates",
        right_on="start",
        strategy="backward",
    )
    .with_column(
        pl.when(pl.col("dates") <= pl.col("end"))
        .then(pl.col("value"))
        .otherwise(None)
    )
    .select(["id", "dates", "value"])
)
shape: (6, 3)
┌─────┬────────────┬───────┐
│ id ┆ dates ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ abc ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-07 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
└─────┴────────────┴───────┘
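(Note that this output is ordered by dates rather than by the original row order of df_2, a side effect of the sort that join_asof requires; the steps below add a row count so that the original order can be restored.)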
In Steps
First, let's append some additional rows to df_2, to show what will happen if a particular row is not contained in an interval in df_1. I'll also add a row number, for easier inspection.
df_2 = pl.DataFrame(
    {
        "id": ["abc", "abc", "456", "abc", "abc", "456", "abc", "abc", "abc"],
        "dates": [
            date(2022, 1, 2),
            date(2022, 3, 4),
            date(2022, 5, 11),
            date(2022, 1, 4),
            date(2022, 3, 7),
            date(2022, 5, 13),
            date(2021, 12, 31),
            date(2022, 3, 1),
            date(2023, 1, 1),
        ],
    }
).with_row_count()
df_2
shape: (9, 3)
┌────────┬─────┬────────────┐
│ row_nr ┆ id ┆ dates │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date │
╞════════╪═════╪════════════╡
│ 0 ┆ abc ┆ 2022-01-02 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ abc ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 456 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ abc ┆ 2022-01-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ abc ┆ 2022-03-07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 456 ┆ 2022-05-13 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ abc ┆ 2021-12-31 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ abc ┆ 2022-03-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ abc ┆ 2023-01-01 │
└────────┴─────┴────────────┘
The join_asof step finds the latest start date that is on or before the dates date. Since intervals do not overlap, this is the only interval that might contain the dates date.
For our purposes, I'll make a copy of the start column so that we can inspect the results. (The start column will not be in the results of the join_asof.)
Note that for a join_asof, both DataFrames must be sorted by the asof columns (dates and start in this case).
(
    df_2
    .sort("dates")
    .join_asof(
        df_1.sort("start").with_column(pl.col("start").alias("start_df1")),
        by="id",
        left_on="dates",
        right_on="start",
        strategy="backward",
    )
    .sort("row_nr")
)
shape: (9, 6)
┌────────┬─────┬────────────┬────────────┬───────┬────────────┐
│ row_nr ┆ id ┆ dates ┆ end ┆ value ┆ start_df1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date ┆ date ┆ i64 ┆ date │
╞════════╪═════╪════════════╪════════════╪═══════╪════════════╡
│ 0 ┆ abc ┆ 2022-01-02 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ abc ┆ 2022-03-04 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ abc ┆ 2022-01-04 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ abc ┆ 2022-03-07 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 456 ┆ 2022-05-13 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ abc ┆ 2021-12-31 ┆ null ┆ null ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ abc ┆ 2022-03-01 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ abc ┆ 2023-01-01 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
└────────┴─────┴────────────┴────────────┴───────┴────────────┘
The last three rows are the ones that I added.
In the last step, we'll inspect the end date, and null out any values where dates is beyond end.
(
    df_2
    .sort("dates")
    .join_asof(
        df_1.sort("start").with_column(pl.col("start").alias("start_df1")),
        by="id",
        left_on="dates",
        right_on="start",
        strategy="backward",
    )
    .with_column(
        pl.when(pl.col("dates") <= pl.col("end"))
        .then(pl.col("value"))
        .otherwise(None)
    )
    .sort("row_nr")
)
shape: (9, 6)
┌────────┬─────┬────────────┬────────────┬───────┬────────────┐
│ row_nr ┆ id ┆ dates ┆ end ┆ value ┆ start_df1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ date ┆ date ┆ i64 ┆ date │
╞════════╪═════╪════════════╪════════════╪═══════╪════════════╡
│ 0 ┆ abc ┆ 2022-01-02 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ abc ┆ 2022-03-04 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ abc ┆ 2022-01-04 ┆ 2022-02-04 ┆ 10 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ abc ┆ 2022-03-07 ┆ 2022-03-10 ┆ 3 ┆ 2022-03-04 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 456 ┆ 2022-05-13 ┆ 2022-05-16 ┆ 4 ┆ 2022-05-11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ abc ┆ 2021-12-31 ┆ null ┆ null ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ abc ┆ 2022-03-01 ┆ 2022-02-04 ┆ null ┆ 2022-01-01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ abc ┆ 2023-01-01 ┆ 2022-03-10 ┆ null ┆ 2022-03-04 │
└────────┴─────┴────────────┴────────────┴───────┴────────────┘
You can see that the last three rows that I added (which purposely don't match any intervals in df_1) have null as value.
Instead of using when/then/otherwise to set value to null, you can filter these out, if that's what you need.
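For example, a minimal sketch of that filter variant, which simply drops the rows that fall outside every interval (including rows with no matching interval at all):
(
    df_2
    .sort("dates")
    .join_asof(
        df_1.sort("start"),
        by="id",
        left_on="dates",
        right_on="start",
        strategy="backward",
    )
    .filter(pl.col("dates") <= pl.col("end"))  # rows with a null end (no interval) are dropped too
    .select(["id", "dates", "value"])
)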
Related
How to change this from pandas to polars:
self.df.loc[
    (self.df['p23'] > z) &
    (self.df['posn'] > 1) &
    (self.df['ncL'] > y),
    'tradeL'] = 1
┌────────────┬──────────┬──────────┬──────┬─────┬─────┬─────┬────────┬────────┐
│ tms ┆ p23 ┆ m23 ┆ posn ┆ ... ┆ ncL ┆ ncS ┆ tradeL ┆ tradeS │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ i64 ┆ ┆ i32 ┆ i32 ┆ i64 ┆ i64 │
╞════════════╪══════════╪══════════╪══════╪═════╪═════╪═════╪════════╪════════╡
│ 1667551500 ┆ 3.582475 ┆ 0.654813 ┆ 23 ┆ ... ┆ 16 ┆ 0 ┆ 0 ┆ 0 │
│ 1672989300 ┆ 2.727239 ┆ 1.510049 ┆ 31 ┆ ... ┆ 3 ┆ 3 ┆ 0 ┆ 0 │
│ 1673215200 ┆ 0.0 ┆ 0.0 ┆ 0 ┆ ... ┆ 0 ┆ 0 ┆ 0 ┆ 0 │
│ 1665506700 ┆ 0.236804 ┆ 4.000484 ┆ 0 ┆ ... ┆ 0 ┆ 26 ┆ 0 ┆ 0 │
│ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... │
│ 1669979700 ┆ 0.437209 ┆ 3.800079 ┆ 6 ┆ ... ┆ 0 ┆ 13 ┆ 0 ┆ 0 │
│ 1669709700 ┆ 2.204868 ┆ 2.03242 ┆ 10 ┆ ... ┆ 12 ┆ 5 ┆ 0 ┆ 0 │
│ 1671648300 ┆ 3.27545 ┆ 0.961838 ┆ 30 ┆ ... ┆ 10 ┆ 0 ┆ 0 ┆ 0 │
│ 1668403800 ┆ 0.0 ┆ 0.0 ┆ 0 ┆ ... ┆ 0 ┆ 0 ┆ 0 ┆ 0 │
└────────────┴──────────┴──────────┴──────┴─────┴─────┴─────┴────────┴────────┘
I haven't found how to change a value in a polars table; all the examples only show output.
I'm expecting something like this:
dfOut.select(
    pl.when((pl.col("p23") > 2)
        & (pl.col("posn") > 1)
        & (pl.col("ncL") > 2)).then(dfOut["tradeL"] = 1))
A | B | C | O
-------------
1 | 0 | 0 | 0  -> False
1 | 1 | 1 | ?  -> True   <- the value in column "O" changes in this row
1 | 0 | 1 | 0  -> False
like this...
df["O"].when((A==1)&(B==1)&(C==1)).than(1).other(0)
Q: How do you change the value in column O if all of [A, B, C] are 1?
SOLUTION (thanks jqurious):
df = df.with_columns(
    pl.when((pl.col("A") == 1) & (pl.col("B") == 1) & (pl.col("C") == 1))
    .then(1)
    .otherwise(pl.col("O"))
    .alias("O")
)
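A quick check against the A/B/C/O sketch above (this small frame is constructed here just to match that example):
import polars as pl

df = pl.DataFrame({"A": [1, 1, 1], "B": [0, 1, 0], "C": [0, 1, 1], "O": [0, 0, 0]})
df = df.with_columns(
    pl.when((pl.col("A") == 1) & (pl.col("B") == 1) & (pl.col("C") == 1))
    .then(1)
    .otherwise(pl.col("O"))
    .alias("O")
)
# only the middle row (A, B and C all 1) gets O = 1; the other rows keep O = 0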
I'd like to calculate aggregated metrics with an expanding window. Basically, given the following dataframe:
from datetime import date
import polars as pl
df = pl.DataFrame({"Day":[date(2022, 1, i) for i in range(1,10)], "value":[1,2,3,4,5,6,7,8,9]})
shape: (9, 2)
┌────────────┬───────┐
│ Day ┆ value │
│ --- ┆ --- │
│ date ┆ i64 │
╞════════════╪═══════╡
│ 2022-01-01 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-02 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-03 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-04 ┆ 4 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-06 ┆ 6 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-07 ┆ 7 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-08 ┆ 8 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-09 ┆ 9 │
└────────────┴───────┘
What I'm after is basically this:
|--|
|-----|
|--------|
I tried to use groupby_rolling and groupby_dynamic, but I couldn't get it to fix the initial time of each group to the first timestamp. My current workaround is something like this:
date_range = pl.date_range(
    df.select("Day").min().row(0)[0],
    df.select("Day").max().row(0)[0],
    "1w",
)
for timestamp in date_range:
    print(df.filter(pl.col("Day").is_between(date_range[0], timestamp, include_bounds=True)))
shape: (1, 2)
┌────────────┬───────┐
│ Day ┆ value │
│ --- ┆ --- │
│ date ┆ i64 │
╞════════════╪═══════╡
│ 2022-01-01 ┆ 1 │
└────────────┴───────┘
shape: (8, 2)
┌────────────┬───────┐
│ Day ┆ value │
│ --- ┆ --- │
│ date ┆ i64 │
╞════════════╪═══════╡
│ 2022-01-01 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-02 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-03 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-04 ┆ 4 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-05 ┆ 5 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-06 ┆ 6 │
...
│ 2022-01-07 ┆ 7 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-08 ┆ 8 │
└────────────┴───────┘
This gives me the exact aggregation I'm after, but I feel like there's a much more efficient way of doing this - and I'd especially like to do my aggregations within a groupby context.
Not sure if it's possible with dynamic/rolling but you could create a dataframe from your date range and do a cross join.
>>> start = df.get_column("Day").min()
... end = df.get_column("Day").max()
... date_range = (
...     pl.date_range(start, end, interval="1w")
...     .to_frame("end")
...     .with_row_count(name="group")
... )
>>> date_range
shape: (2, 2)
┌───────┬────────────┐
│ group ┆ end │
│ --- ┆ --- │
│ u32 ┆ date │
╞═══════╪════════════╡
│ 0 ┆ 2022-01-01 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2022-01-08 │
└───────┴────────────┘
You can then run your filter and be left with a group identifier:
>>> (
...     df
...     .join(date_range, how="cross")
...     .with_column(pl.lit(start).alias("start"))
...     .filter(
...         pl.col("Day").is_between(
...             pl.col("start"),
...             pl.col("end"),
...             include_bounds=True))
...     .drop(["start", "end"])
... )
shape: (9, 3)
┌────────────┬───────┬───────┐
│ Day ┆ value ┆ group │
│ --- ┆ --- ┆ --- │
│ date ┆ i64 ┆ u32 │
╞════════════╪═══════╪═══════╡
│ 2022-01-01 ┆ 1 ┆ 0 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-01 ┆ 1 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-02 ┆ 2 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-03 ┆ 3 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-05 ┆ 5 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-06 ┆ 6 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-07 ┆ 7 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2022-01-08 ┆ 8 ┆ 1 │
└────────────┴───────┴───────┘
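From here, the group identifier supports the aggregations within a groupby context that the question asks for. A sketch continuing from the frames built above (the aggregate columns are just illustrative):
>>> (
...     df
...     .join(date_range, how="cross")
...     .with_column(pl.lit(start).alias("start"))
...     .filter(
...         pl.col("Day").is_between(
...             pl.col("start"),
...             pl.col("end"),
...             include_bounds=True))
...     .groupby("group")
...     .agg([
...         pl.col("value").sum().alias("value_sum"),   # expanding sum up to each window end
...         pl.col("Day").max().alias("window_end"),
...     ])
... )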
I have a small use case, and here is a polars dataframe.
df_names = pl.DataFrame({
    'LN': ['Mallesham', 'Bhavik', 'Mallesham', 'Bhavik', 'Mahesh', 'Naresh', 'Sharath', 'Rakesh', 'Mallesham'],
    'FN': ['Yamulla', 'Yamulla', 'Yamulla', 'Yamulla', 'Dayala', 'Burre', 'Velmala', 'Uppu', 'Yamulla'],
    'SSN': ['123', '456', '123', '456', '893', '111', '222', '333', '123'],
    'Address': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'S'],
})
Here I would like to group on LN, FN and SSN, and create a new column holding the number of observations for each group combination; the expected output is below.
'Mallesham', 'Yamulla', '123' appears 3 times, hence the LN_FN_SSN_count field is filled with 3.
You can use an expression using over (which is like grouping, aggregating and self-joining in other libs, but without the need for the join):
df_names.with_column(pl.count().over(['LN', 'FN', 'SSN']).alias('LN_FN_SSN_count'))
┌───────────┬─────────┬─────┬─────────┬─────────────────┐
│ LN ┆ FN ┆ SSN ┆ Address ┆ LN_FN_SSN_count │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ u32 │
╞═══════════╪═════════╪═════╪═════════╪═════════════════╡
│ Mallesham ┆ Yamulla ┆ 123 ┆ A ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Bhavik ┆ Yamulla ┆ 456 ┆ B ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Mallesham ┆ Yamulla ┆ 123 ┆ C ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Bhavik ┆ Yamulla ┆ 456 ┆ D ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Naresh ┆ Burre ┆ 111 ┆ F ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Sharath ┆ Velmala ┆ 222 ┆ G ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Rakesh ┆ Uppu ┆ 333 ┆ H ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Mallesham ┆ Yamulla ┆ 123 ┆ S ┆ 3 │
└───────────┴─────────┴─────┴─────────┴─────────────────┘
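For comparison, here is a sketch of the groupby-aggregate-join sequence that over saves you from (same result, one extra join):
counts = df_names.groupby(['LN', 'FN', 'SSN']).agg(
    pl.count().alias('LN_FN_SSN_count')  # observations per group combination
)
df_names.join(counts, on=['LN', 'FN', 'SSN'], how='left')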
I am trying to create a list of new columns based on the latest column. I can achieve this by using with_columns() and simple multiplication. Given that I want a long list of new columns, I am thinking of using a loop with an f-string. However, I am not so sure how to use f-strings for polars column names.
df = pl.DataFrame(
    {
        "id": ["NY", "TK", "FD"],
        "eat2003": [-9, 3, 8],
        "eat2004": [10, 11, 8],
    }
)
df
┌─────┬─────────┬─────────┐
│ id ┆ eat2003 ┆ eat2004 │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╡
│ NY ┆ -9 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 3 ┆ 11 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 8 ┆ 8 │
└─────┴─────────┴─────────┘
(
    df
    .with_columns((pl.col('eat2004') * 2).alias('eat2005'))
    .with_columns((pl.col('eat2005') * 2).alias('eat2006'))
    .with_columns((pl.col('eat2006') * 2).alias('eat2007'))
)
Expected output:
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2003 ┆ eat2004 ┆ eat2005 ┆ eat2006 ┆ eat2007 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ -9 ┆ 10 ┆ 20 ┆ 40 ┆ 80 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 3 ┆ 11 ┆ 22 ┆ 44 ┆ 88 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 8 ┆ 8 ┆ 16 ┆ 32 ┆ 64 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┘
If you can base each of the new columns on eat2004, I would suggest the following approach:
expr_list = [
    (pl.col('eat2004') * (2 ** i)).alias(f"eat{2004 + i}")
    for i in range(1, 8)
]
(
    df
    .with_columns(expr_list)
)
shape: (3, 10)
┌─────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ id ┆ eat2003 ┆ eat2004 ┆ eat2005 ┆ eat2006 ┆ eat2007 ┆ eat2008 ┆ eat2009 ┆ eat2010 ┆ eat2011 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ NY ┆ -9 ┆ 10 ┆ 20 ┆ 40 ┆ 80 ┆ 160 ┆ 320 ┆ 640 ┆ 1280 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ TK ┆ 3 ┆ 11 ┆ 22 ┆ 44 ┆ 88 ┆ 176 ┆ 352 ┆ 704 ┆ 1408 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ FD ┆ 8 ┆ 8 ┆ 16 ┆ 32 ┆ 64 ┆ 128 ┆ 256 ┆ 512 ┆ 1024 │
└─────┴─────────┴─────────┴─────────┴─────────┴─────────┴─────────┴─────────┴─────────┴─────────┘
As long as all the Expressions are independent of each other, we can run them in parallel in a single with_columns context (for a nice performance gain). However, if the Expressions are not independent, then they must each be run in successive with_column contexts, as sketched below.
I've purposely created the list of Expressions outside of any query context, to demonstrate that Expressions can be generated independently of any query. Later, the list can be supplied to with_columns. This approach helps with debugging and keeping code clean as you build and test your Expressions.
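If, on the other hand, each new column genuinely depended on the previous one, a sketch of that successive case (still using f-strings for the names) might look like this:
out = df
for year in range(2005, 2008):
    # each iteration reads the column created by the previous one,
    # so these must run in successive contexts rather than one with_columns
    out = out.with_columns((pl.col(f"eat{year - 1}") * 2).alias(f"eat{year}"))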
I have a polars dataframe with two date columns that represent a start and end date and then a value that I want to repeat for all dates in between those two dates so that I can join those on other tables.
Example input is
| id  | start      | end        | value |
|-----|------------|------------|-------|
| 123 | 2022-01-01 | 2022-01-04 | 10    |
| abc | 2022-03-04 | 2022-03-04 | 3     |
| 456 | 2022-05-11 | 2022-05-16 | 4     |
and expected output is
| id  | date       | value |
|-----|------------|-------|
| 123 | 2022-01-01 | 10    |
| 123 | 2022-01-02 | 10    |
| 123 | 2022-01-03 | 10    |
| 123 | 2022-01-04 | 10    |
| abc | 2022-03-04 | 3     |
| 456 | 2022-05-11 | 4     |
| 456 | 2022-05-12 | 4     |
| 456 | 2022-05-13 | 4     |
| 456 | 2022-05-14 | 4     |
| 456 | 2022-05-15 | 4     |
| 456 | 2022-05-16 | 4     |
I struggled today with the same problem, and I thought I'd share my solution.
As cbilot already mentions, pl.date_range doesn't take expressions as its low and high values. So I worked around that by using apply.
Data:
import polars as pl
from datetime import date
df = pl.DataFrame(
    {
        "id": ["123", "abc", "456"],
        "start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
        "end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
        "value": [10, 3, 4],
    }
)
Solution:
(
    df
    .with_columns(
        pl.struct(["start", "end"])
        .apply(lambda x: pl.date_range(x["start"], x["end"], "1d"))
        .alias("date")
    )
    .explode("date")
    .select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
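(Note that apply runs a Python function once per row, so on larger frames the expression-only approach in the next answer, which uses arange, should generally be faster.)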
Starting with this data:
import polars as pl
from datetime import date
df = pl.DataFrame(
    {
        "id": ["123", "abc", "456"],
        "start": [date(2022, 1, 1), date(2022, 3, 4), date(2022, 5, 11)],
        "end": [date(2022, 1, 4), date(2022, 3, 4), date(2022, 5, 16)],
        "value": [10, 3, 4],
    }
)
df
shape: (3, 4)
┌─────┬────────────┬────────────┬───────┐
│ id ┆ start ┆ end ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 │
╞═════╪════════════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴────────────┴───────┘
The Algorithm
(
    df
    .with_columns(
        pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")
    )
    .explode("date")
    .with_column(pl.col("date").cast(pl.Date))
    .select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘
In steps
Normally, we would create a range of dates using the date_range expression. However, date_range does not take an Expression as its low and high parameters.
arange, on the other hand, does allow Expressions as its low and high parameters. We can (implicitly) cast the start and end dates to integers, which represent the number of days since the UNIX epoch.
The result is a list of integers representing the days between (and including) the start and end dates, expressed as days since the UNIX epoch.
Notice that we have to add 1 to the high parameter to make sure we capture the end date.
(
    df
    .with_columns(
        pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")
    )
)
shape: (3, 5)
┌─────┬────────────┬────────────┬───────┬───────────────────────────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ list[i64] │
╞═════╪════════════╪════════════╪═══════╪═══════════════════════════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ [18993, 18994, ... 18996] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ [19055] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ [19123, 19124, ... 19128] │
└─────┴────────────┴────────────┴───────┴───────────────────────────┘
Next we can use explode to place each of the integers on a separate row.
(
    df
    .with_columns(
        pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")
    )
    .explode("date")
)
shape: (11, 5)
┌─────┬────────────┬────────────┬───────┬───────┐
│ id ┆ start ┆ end ┆ value ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ date ┆ date ┆ i64 ┆ i64 │
╞═════╪════════════╪════════════╪═══════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18993 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18994 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18995 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-01 ┆ 2022-01-04 ┆ 10 ┆ 18996 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 2022-03-04 ┆ 3 ┆ 19055 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19123 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19124 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19125 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19126 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19127 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 2022-05-16 ┆ 4 ┆ 19128 │
└─────┴────────────┴────────────┴───────┴───────┘
The final step is to cast the date column back to a pl.Date, and then select only the columns that we want.
(
    df
    .with_columns(
        pl.arange(pl.col("start"), pl.col("end") + 1).alias("date")
    )
    .explode("date")
    .with_column(pl.col("date").cast(pl.Date))
    .select(["id", "date", "value"])
)
shape: (11, 3)
┌─────┬────────────┬───────┐
│ id ┆ date ┆ value │
│ --- ┆ --- ┆ --- │
│ str ┆ date ┆ i64 │
╞═════╪════════════╪═══════╡
│ 123 ┆ 2022-01-01 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-02 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-03 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 123 ┆ 2022-01-04 ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ abc ┆ 2022-03-04 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-11 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-12 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-13 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-14 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-15 ┆ 4 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 456 ┆ 2022-05-16 ┆ 4 │
└─────┴────────────┴───────┘