What is the best way to find the differences between 2 Polars dataframes?
The frame_equal method tells me whether there is a difference; I want to find where the difference is.
Example:
import polars as pl
df1 = pl.DataFrame([
{'id': 1,'col1': ['a',None],'col2': ['x']},
{'id': 2,'col1': ['b'],'col2': ['y', None]},
{'id': 3,'col1': [None],'col2': ['z']}]
)
┌─────┬─────────────┬─────────────┐
│ id ┆ col1 ┆ col2 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ list[str] ┆ list[str] │
╞═════╪═════════════╪═════════════╡
│ 1 ┆ ["a", null] ┆ ["x"] │
│ 2 ┆ ["b"] ┆ ["y", null] │
│ 3 ┆ [null] ┆ ["z"] │
└─────┴─────────────┴─────────────┘
df2 = pl.DataFrame([
{'id': 1,'col1': ['a'],'col2': ['x']},
{'id': 2,'col1': ['b', None],'col2': ['y', None]},
{'id': 3,'col1': [None],'col2': ['z']}]
)
┌─────┬─────────────┬─────────────┐
│ id ┆ col1 ┆ col2 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ list[str] ┆ list[str] │
╞═════╪═════════════╪═════════════╡
│ 1 ┆ ["a"] ┆ ["x"] │
│ 2 ┆ ["b", null] ┆ ["y", null] │
│ 3 ┆ [null] ┆ ["z"] │
└─────┴─────────────┴─────────────┘
The difference in the example is for id = 1 and id = 2.
I can join the dataframes:
df1.join(df2, on='id', suffix='_df2')
┌─────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ id ┆ col1 ┆ col2 ┆ col1_df2 ┆ col2_df2 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ list[str] ┆ list[str] ┆ list[str] ┆ list[str] │
╞═════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ 1 ┆ ["a", null] ┆ ["x"] ┆ ["a"] ┆ ["x"] │
│ 2 ┆ ["b"] ┆ ["y", null] ┆ ["b", null] ┆ ["y", null] │
│ 3 ┆ [null] ┆ ["z"] ┆ [null] ┆ ["z"] │
└─────┴─────────────┴─────────────┴─────────────┴─────────────┘
Expected result
I would like to either:
add a boolean column that shows True in the rows with a difference, or
filter and only display rows with a difference.
The example has only 2 columns, but there are more columns in the dataframe.
Here's the filter approach
df1.join(df2, on='id', suffix='_df2') \
.filter(pl.any([pl.col(x)!=pl.col(f"{x}_df2") for x in df1.columns if x!='id']))
If you wanted the bool column then you just change the filter to with_columns and add an alias.
df1.join(df2, on='id', suffix='_df2') \
.with_columns(pl.any([pl.col(x)!=pl.col(f"{x}_df2") for x in df1.columns if x!='id']).alias("has_diff"))
This assumes that each df has all the same columns other than 'id'.
You can also use == to compare frames:
>>> df1 == df2
shape: (3, 3)
┌──────┬───────┬──────┐
│ id | col1 | col2 │
│ --- | --- | --- │
│ bool | bool | bool │
╞══════╪═══════╪══════╡
│ true | false | true │
│ true | false | true │
│ true | true | true │
└──────┴───────┴──────┘
Related
I have a dataframe as-
pl.DataFrame({'last_name':['Unknown','Mallesham',np.nan,'Bhavik','Unknown'],
'first_name_or_initial':['U',np.nan,'TRUE','yamulla',np.nan],
'number':['003123490','012457847','100030303','','0023004648'],
'date_of_birth':[np.nan,'12/09/1900','12/09/1900','12/09/1900',np.nan]})
Here I would like to add a new column that contains the names of the fields that hold any information other than NULL/EMPTY/NaN.
For example:
First row: it has last-name, first-name and number information, and the date of birth is NULL; hence the new column conso_field is filled with those field names, i.e. last_name, first_name_or_initial and number. Likewise, I need this done for all the rows.
Here is an expected output:
First, let's expand the example to show a row with all null/empty fields (to show how the algorithm handles this case).
import polars as pl
import numpy as np
df = pl.DataFrame(
{
"last_name": ["Unknown", "Mallesham", np.nan, "Bhavik", "Unknown", None],
"first_name_or_initial": ["U", np.nan, "TRUE", "yamulla", np.nan, None],
"number": ["003123490", "012457847", "100030303", "", "0023004648", None],
"date_of_birth": [np.nan, "12/09/1900", "12/09/1900", "12/09/1900", np.nan, None],
}
)
df
shape: (6, 4)
┌───────────┬───────────────────────┬────────────┬───────────────┐
│ last_name ┆ first_name_or_initial ┆ number ┆ date_of_birth │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str │
╞═══════════╪═══════════════════════╪════════════╪═══════════════╡
│ Unknown ┆ U ┆ 003123490 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Mallesham ┆ null ┆ 012457847 ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ TRUE ┆ 100030303 ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Bhavik ┆ yamulla ┆ ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Unknown ┆ null ┆ 0023004648 ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ null ┆ null ┆ null │
└───────────┴───────────────────────┴────────────┴───────────────┘
The Algorithm
df = df.with_row_count()
(
df
.join(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
.groupby('row_nr')
.agg(pl.col('variable').alias('conso_field'))
.with_column(pl.col('conso_field').arr.join(','))
,
on='row_nr',
how='left'
)
)
shape: (6, 6)
┌────────┬───────────┬───────────────────────┬────────────┬───────────────┬─────────────────────────────────────┐
│ row_nr ┆ last_name ┆ first_name_or_initial ┆ number ┆ date_of_birth ┆ conso_field │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str ┆ str ┆ str ┆ str │
╞════════╪═══════════╪═══════════════════════╪════════════╪═══════════════╪═════════════════════════════════════╡
│ 0 ┆ Unknown ┆ U ┆ 003123490 ┆ null ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ Mallesham ┆ null ┆ 012457847 ┆ 12/09/1900 ┆ last_name,number,date_of_birth │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null ┆ TRUE ┆ 100030303 ┆ 12/09/1900 ┆ first_name_or_initial,number,dat... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ Bhavik ┆ yamulla ┆ ┆ 12/09/1900 ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ Unknown ┆ null ┆ 0023004648 ┆ null ┆ last_name,number │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ null ┆ null ┆ null ┆ null ┆ null │
└────────┴───────────┴───────────────────────┴────────────┴───────────────┴─────────────────────────────────────┘
Note that the algorithm keeps the last row with all null/empty values.
How it works
To see how it works, let's take it in steps.
First, we'll need to attach a row number to each row. (This is needed in case any row has all null/empty values.)
Then we'll use melt to place each value in each column on a separate row, next to its column name.
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
)
shape: (24, 3)
┌────────┬───────────────┬────────────┐
│ row_nr ┆ variable ┆ value │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str │
╞════════╪═══════════════╪════════════╡
│ 0 ┆ last_name ┆ Unknown │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ last_name ┆ Mallesham │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ last_name ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ last_name ┆ Bhavik │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ date_of_birth ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ date_of_birth ┆ null │
└────────┴───────────────┴────────────┘
Note that column values will be converted to string values in this step.
Next, we'll filter out any rows with null or "" values.
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
)
shape: (14, 3)
┌────────┬───────────────┬────────────┐
│ row_nr ┆ variable ┆ value │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str │
╞════════╪═══════════════╪════════════╡
│ 0 ┆ last_name ┆ Unknown │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ last_name ┆ Mallesham │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ last_name ┆ Bhavik │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ last_name ┆ Unknown │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ number ┆ 0023004648 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ date_of_birth ┆ 12/09/1900 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ date_of_birth ┆ 12/09/1900 │
└────────┴───────────────┴────────────┘
In the next step, we'll aggregate up all the remaining rows by row number, keeping only the column names. These represent columns with non-null, non-empty values.
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
.groupby('row_nr')
.agg(pl.col('variable').alias('conso_field'))
)
shape: (5, 2)
┌────────┬─────────────────────────────────────┐
│ row_nr ┆ conso_field │
│ --- ┆ --- │
│ u32 ┆ list[str] │
╞════════╪═════════════════════════════════════╡
│ 2 ┆ ["first_name_or_initial", "numbe... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ ["last_name", "first_name_or_ini... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ ["last_name", "number"] │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0 ┆ ["last_name", "first_name_or_ini... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ ["last_name", "number", "date_of... │
└────────┴─────────────────────────────────────┘
Note that we get a list of column names for each row. (Note: we don't need to worry about the order of the rows at this point. We'll use the row number and a left-join in the last step to recombine the values with the original DataFrame.)
Then, it's simply a matter of joining the column names into one string:
df = df.with_row_count()
(
df
.melt(id_vars="row_nr")
.filter(pl.col('value').is_not_null() & (pl.col('value') != ""))
.groupby('row_nr')
.agg(pl.col('variable').alias('conso_field'))
.with_column(pl.col('conso_field').arr.join(','))
)
shape: (5, 2)
┌────────┬─────────────────────────────────────┐
│ row_nr ┆ conso_field │
│ --- ┆ --- │
│ u32 ┆ str │
╞════════╪═════════════════════════════════════╡
│ 3 ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ first_name_or_initial,number,dat... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ last_name,number │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0 ┆ last_name,first_name_or_initial,... │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ last_name,number,date_of_birth │
└────────┴─────────────────────────────────────┘
From here, we simply use a "left join" to merge the data back to the original dataset (as shown at the beginning.)
In pandas, the following code will split the string from col1 into many columns. Is there a way to do this in Polars?
d = {'col1': ["a/b/c/d", "a/b/c/d"]}
df= pd.DataFrame(data=d)
df[["a","b","c","d"]]=df["col1"].str.split('/',expand=True)
Here's an algorithm that will automatically adjust for the required number of columns -- and should be quite performant.
Let's start with this data. Notice that I've purposely added the empty string "" and a null value - to show how the algorithm handles these values. Also, the number of split strings varies widely.
import polars as pl
df = pl.DataFrame(
{
"my_str": ["cat", "cat/dog", None, "", "cat/dog/aardvark/mouse/frog"],
}
)
df
shape: (5, 1)
┌─────────────────────────────┐
│ my_str │
│ --- │
│ str │
╞═════════════════════════════╡
│ cat │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cat/dog │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ cat/dog/aardvark/mouse/frog │
└─────────────────────────────┘
The Algorithm
The algorithm below may be a bit more than you need, but you can edit/delete/add as you need.
(
df
.with_row_count('id')
.with_column(pl.col("my_str").str.split("/").alias("split_str"))
.explode("split_str")
.with_column(
("string_" + pl.arange(0, pl.count()).cast(pl.Utf8).str.zfill(2))
.over("id")
.alias("col_nm")
)
.pivot(
index=['id', 'my_str'],
values='split_str',
columns='col_nm',
)
.with_column(
pl.col('^string_.*$').fill_null("")
)
)
shape: (5, 7)
┌─────┬─────────────────────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ id ┆ my_str ┆ string_00 ┆ string_01 ┆ string_02 ┆ string_03 ┆ string_04 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str ┆ str ┆ str ┆ str ┆ str │
╞═════╪═════════════════════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0 ┆ cat ┆ cat ┆ ┆ ┆ ┆ │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ cat/dog ┆ cat ┆ dog ┆ ┆ ┆ │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ ┆ ┆ ┆ ┆ ┆ │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ cat ┆ dog ┆ aardvark ┆ mouse ┆ frog │
└─────┴─────────────────────────────┴───────────┴───────────┴───────────┴───────────┴───────────┘
How it works
We first assign a row number id (which we'll need later), and use split to separate the strings. Note that the split strings form a list.
(
df
.with_row_count('id')
.with_column(pl.col("my_str").str.split("/").alias("split_str"))
)
shape: (5, 3)
┌─────┬─────────────────────────────┬────────────────────────────┐
│ id ┆ my_str ┆ split_str │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ list[str] │
╞═════╪═════════════════════════════╪════════════════════════════╡
│ 0 ┆ cat ┆ ["cat"] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ cat/dog ┆ ["cat", "dog"] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ ┆ [""] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ ["cat", "dog", ... "frog"] │
└─────┴─────────────────────────────┴────────────────────────────┘
Next, we'll use explode to put each string on its own row. (Notice how the id column tracks the original row that each string came from.)
(
df
.with_row_count('id')
.with_column(pl.col("my_str").str.split("/").alias("split_str"))
.explode("split_str")
)
shape: (10, 3)
┌─────┬─────────────────────────────┬───────────┐
│ id ┆ my_str ┆ split_str │
│ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str │
╞═════╪═════════════════════════════╪═══════════╡
│ 0 ┆ cat ┆ cat │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ cat/dog ┆ cat │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ cat/dog ┆ dog │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ ┆ │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ cat │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ dog │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ aardvark │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ mouse │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ frog │
└─────┴─────────────────────────────┴───────────┘
In the next step, we're going to generate our column names. I chose to call each column string_XX where XX is the offset with regards to the original string.
I've used the handy zfill expression so that 1 becomes 01. (This makes sure that string_02 comes before string_10 if you decide to sort your columns later.)
You can substitute your own naming in this step as you need.
(
df
.with_row_count('id')
.with_column(pl.col("my_str").str.split("/").alias("split_str"))
.explode("split_str")
.with_column(
("string_" + pl.arange(0, pl.count()).cast(pl.Utf8).str.zfill(2))
.over("id")
.alias("col_nm")
)
)
shape: (10, 4)
┌─────┬─────────────────────────────┬───────────┬───────────┐
│ id ┆ my_str ┆ split_str ┆ col_nm │
│ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str ┆ str │
╞═════╪═════════════════════════════╪═══════════╪═══════════╡
│ 0 ┆ cat ┆ cat ┆ string_00 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ cat/dog ┆ cat ┆ string_00 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ cat/dog ┆ dog ┆ string_01 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null ┆ null ┆ string_00 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ ┆ ┆ string_00 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ cat ┆ string_00 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ dog ┆ string_01 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ aardvark ┆ string_02 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ mouse ┆ string_03 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ frog ┆ string_04 │
└─────┴─────────────────────────────┴───────────┴───────────┘
In the next step, we'll use the pivot function to place each string in its own column.
(
df
.with_row_count('id')
.with_column(pl.col("my_str").str.split("/").alias("split_str"))
.explode("split_str")
.with_column(
("string_" + pl.arange(0, pl.count()).cast(pl.Utf8).str.zfill(2))
.over("id")
.alias("col_nm")
)
.pivot(
index=['id', 'my_str'],
values='split_str',
columns='col_nm',
)
)
shape: (5, 7)
┌─────┬─────────────────────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ id ┆ my_str ┆ string_00 ┆ string_01 ┆ string_02 ┆ string_03 ┆ string_04 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ str ┆ str ┆ str ┆ str ┆ str │
╞═════╪═════════════════════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0 ┆ cat ┆ cat ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ cat/dog ┆ cat ┆ dog ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ ┆ ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ cat/dog/aardvark/mouse/frog ┆ cat ┆ dog ┆ aardvark ┆ mouse ┆ frog │
└─────┴─────────────────────────────┴───────────┴───────────┴───────────┴───────────┴───────────┘
All that remains is to use fill_null to replace the null values with an empty string "". Notice that I've used a regex expression in the col expression to target only those columns whose names start with "string_". (Depending on your other data, you may not want to replace null with "" everywhere in your data.)
You can use apply() method
import polars as pl
from polars import col
df = pl.DataFrame({
'col1': ["a/b/c/d", "e/f/j/k"]
})
print(df)
df:
shape: (2, 1)
┌─────────┐
│ col1 │
│ --- │
│ str │
╞═════════╡
│ a/b/c/d │
├╌╌╌╌╌╌╌╌╌┤
│ e/f/j/k │
└─────────┘
With apply()
df = df.with_columns([
col('col1'),
*[col('col1').apply(lambda s, i=i: s.split('/')[i]).alias(col_name)
for i, col_name in enumerate(['a', 'b', 'c', 'd'])]
# or without 'for'
# col('col1').apply(lambda s: s.split('/')[0]).alias('a'),
# col('col1').apply(lambda s: s.split('/')[1]).alias('b'),
# col('col1').apply(lambda s: s.split('/')[2]).alias('c'),
# col('col1').apply(lambda s: s.split('/')[3]).alias('d')
])
print(df)
df:
shape: (2, 5)
┌─────────┬─────┬─────┬─────┬─────┐
│ col1 ┆ a ┆ b ┆ c ┆ d │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ str │
╞═════════╪═════╪═════╪═════╪═════╡
│ a/b/c/d ┆ a ┆ b ┆ c ┆ d │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ e/f/j/k ┆ e ┆ f ┆ j ┆ k │
└─────────┴─────┴─────┴─────┴─────┘
It works, but there is probably a more idiomatic way.
With this approach, you split the string to turn col1 into a list of strings. Then you loop over the list positions and use .arr.get to extract each element into a separate column:
(df
.with_column(pl.col("col1").str.split("/"))
.with_columns(
[pl.col("col1").arr.get(i).alias(str(i)) for i in range(len(df[0,"col1"].split('/')))
]
)
)
One challenge is whether you will have the same number of elements in the list in each row. In this solution I've assumed you have and have taken the length of the list in the first row to do the loop.
You can use struct datatype, as described in this post: https://stackoverflow.com/a/74219166:
# Split "my_str" on "/" and spread the pieces over one column per piece
# by converting the resulting list column to a struct and unnesting it.
import polars as pl  # was `import pandas as pl`; every call below is Polars API

df = pl.DataFrame({
    "my_str": ["cat", "cat/dog", None, "", "cat/dog/aardvark/mouse/frog"],
})
# n_field_strategy="max_width" is required here; without it, unnest()
# produces only a single column.
df.select(
    pl.col('my_str').str.split('/')
    .arr.to_struct(n_field_strategy="max_width")
).unnest('my_str')
Notice you must use n_field_strategy="max_width", otherwise, unnest() will create only 1 column.
import polars as pl
#Create new column list(can be created dynamically as well)
new_cols=['new_col1','new_col2','new_col3',.....,new_coln]
#Define expression
expr = [pl.col('col1').str.split('/').arr.get(i).alias(col)
for i,col in enumerate(new_cols)
]
#Apply Expression
df.with_columns(expr)
I have a df as follows:
with n known at runtime.
I need to count 1 and -1 values over the rows.
Namely, I need a new df (or new columns in the old one):
Any advice?
As of polars 0.13.60, you can use polars.sum with an Expression to sum horizontally. For example, starting with this data
import polars as pl
data_frame = (
pl.DataFrame({
'col0': [1, -1, 1, -1, 1],
'col1': [1, 1, 1, 1, 1],
'col2': [-1, -1, -1, -1, -1],
'col3': [1, -1, -1, 1, 1],
})
)
data_frame
shape: (5, 4)
┌──────┬──────┬──────┬──────┐
│ col0 ┆ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪══════╡
│ 1 ┆ 1 ┆ -1 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ -1 ┆ 1 ┆ -1 ┆ -1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 1 ┆ 1 ┆ -1 ┆ -1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ -1 ┆ 1 ┆ -1 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 1 ┆ 1 ┆ -1 ┆ 1 │
└──────┴──────┴──────┴──────┘
We can sum all columns horizontally, using polars.all.
(
data_frame
.with_columns([
pl.sum(pl.all() > 0).alias('pos'),
pl.sum(pl.all() < 0).alias('neg'),
])
)
shape: (5, 6)
┌──────┬──────┬──────┬──────┬─────┬─────┐
│ col0 ┆ col1 ┆ col2 ┆ col3 ┆ pos ┆ neg │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪══════╪═════╪═════╡
│ 1 ┆ 1 ┆ -1 ┆ 1 ┆ 3 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ -1 ┆ 1 ┆ -1 ┆ -1 ┆ 1 ┆ 3 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 1 ┆ 1 ┆ -1 ┆ -1 ┆ 2 ┆ 2 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ -1 ┆ 1 ┆ -1 ┆ 1 ┆ 2 ┆ 2 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 1 ┆ 1 ┆ -1 ┆ 1 ┆ 3 ┆ 1 │
└──────┴──────┴──────┴──────┴─────┴─────┘
How it works
The above algorithm works because Polars will upcast boolean values to unsigned integers when summing. For example, the expression pl.all() > 0 produces Expressions of type boolean.
(
data_frame
.with_columns([
(pl.all() > 0).keep_name()
])
)
shape: (5, 4)
┌───────┬──────┬───────┬───────┐
│ col0 ┆ col1 ┆ col2 ┆ col3 │
│ --- ┆ --- ┆ --- ┆ --- │
│ bool ┆ bool ┆ bool ┆ bool │
╞═══════╪══════╪═══════╪═══════╡
│ true ┆ true ┆ false ┆ true │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ false ┆ true ┆ false ┆ false │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ true ┆ true ┆ false ┆ false │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ false ┆ true ┆ false ┆ true │
├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ true ┆ true ┆ false ┆ true │
└───────┴──────┴───────┴───────┘
polars.sum will then convert these to unsigned integers as it sums them horizontally.
For examples of how to select only certain columns (by name, by type, by regex expression, etc...), see this StackOverflow response.
Now I have a dataframe like this:
df = pd.DataFrame({"asset":["a","b","c","a","b","c","b","c"],"v":[1,2,3,4,5,6,7,8],"date":["2017","2011","2012","2013","2014","2015","2016","2010"]})
I can calculate the pct_change by groupby and my function like this:
def fun(df):
    """Return *df* sorted by ``date`` with a ``pct_change`` column added.

    Intended to be applied to each group of a ``groupby("asset")``.

    Args:
        df: pandas DataFrame with at least the columns ``"date"`` and ``"v"``.

    Returns:
        A new DataFrame, ordered by ``"date"``, with an extra
        ``"pct_change"`` column holding the percentage change of ``"v"``
        from the previous row; the first row has no predecessor, so its
        value is NaN.
    """
    # sort_values returns a new frame, so the caller's frame is not modified.
    ordered = df.sort_values(by="date")
    ordered["pct_change"] = ordered["v"].pct_change()
    return ordered
df = df.groupby("asset",as_index=False).apply(fun)
Now I want to know how I can get the same result with Polars.
Here are two options. One using window functions, and one using groupby + explode.
You should benchmark and see which is faster on your use case.
preparing data
df = pl.DataFrame({
"asset":["a","b","c","a","b","c","b","c"],
"v":[1,2,3,4,5,6,7,8],
"date":["2017","2011","2012","2013","2014","2015","2016","2010"]
})
using window functions
(
df.sort(["asset", "date"])
.with_columns([
pl.col("v").pct_change().over("asset").alias("pct_change")
])
)
using groupby + explode
(df.groupby("asset")
.agg([
pl.all().first(),
pl.col("v").sort_by("date").pct_change().alias("pct_change")
]).explode("pct_change")
)
Result
Both output:
shape: (8, 4)
┌───────┬─────┬──────┬────────────┐
│ asset ┆ v ┆ date ┆ pct_change │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str ┆ f64 │
╞═══════╪═════╪══════╪════════════╡
│ a ┆ 4 ┆ 2013 ┆ null │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ a ┆ 1 ┆ 2017 ┆ -0.75 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 2011 ┆ null │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 5 ┆ 2014 ┆ 1.5 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 7 ┆ 2016 ┆ 0.4 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 8 ┆ 2010 ┆ null │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 2012 ┆ -0.625 │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 6 ┆ 2015 ┆ 1.0 │
└───────┴─────┴──────┴────────────┘
In Polars (as I would in pandas), I want to interchange/assign column values, and interchange row values between two rows.
for i in range(len(k2)):
k2['column1'][i] == k2['column2'][i]
k2['column3'][i] == k2['column4'][i]
You can use alias to copy & rename columns:
import polars as pl
k2 = pl.DataFrame({"column1": [1,2,3],
"column2": [4,5,6],
"column3": [7,8,9],
"column4": [10,11,12]})
┌─────────┬─────────┬─────────┬─────────┐
│ column1 ┆ column2 ┆ column3 ┆ column4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════════╪═════════╪═════════╪═════════╡
│ 1 ┆ 4 ┆ 7 ┆ 10 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 5 ┆ 8 ┆ 11 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 6 ┆ 9 ┆ 12 │
└─────────┴─────────┴─────────┴─────────┘
k2.with_columns([pl.col("column2").alias("column1"), pl.col("column4").alias("column3")])
which prints out
┌─────────┬─────────┬─────────┬─────────┐
│ column1 ┆ column2 ┆ column3 ┆ column4 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════════╪═════════╪═════════╪═════════╡
│ 4 ┆ 4 ┆ 10 ┆ 10 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 5 ┆ 11 ┆ 11 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 6 ┆ 12 ┆ 12 │
└─────────┴─────────┴─────────┴─────────┘