Pyspark: replace row value by another column with the same name - python

I have a pyspark dataframe as below, df
| D1 | D2 | D3 |Out|
| 2 | 4 | 5 |D2 |
| 5 | 8 | 4 |D3 |
| 3 | 7 | 8 |D1 |
And I would like to replace the row values of the "out" column by the row value within the same row with the same column name of the row value of the "out" column.
| D1 | D2 | D3 |Out|Result|
| 2 | 4 | 5 |D2 |4 |
| 5 | 8 | 4 |D3 |4 |
| 3 | 7 | 8 |D1 |3 |
df_lag=df.rdd.map(lambda row: row + (row[row.Out],)).toDF(df.columns + ["Result"])
I have tried the code above and it could obtain the result but when I tried to save to csv, it keeps showing the error "Job aborted due to......" so I would like to ask if there is any other method could also obtain the same result. Thanks!

You can use chained when statements generated dynamically from the column names using reduce:
from functools import reduce
import pyspark.sql.functions as F
df2 = df.withColumn(
'Result',
reduce(
lambda x, y: x.when(F.col('Out') == y, F.col(y)),
df.columns[:-1],
F
)
)
df2.show()
+---+---+---+---+------+
| D1| D2| D3|Out|Result|
+---+---+---+---+------+
| 2| 4| 5| D2| 4|
| 5| 8| 4| D3| 4|
| 3| 7| 8| D1| 3|
+---+---+---+---+------+

Related

Giving Same Number If Same Group

I have dataframe like this
name status
+----+------+
|name|value |
+----+------+
| x | down|
| y |normal|
| z | down|
| x |normal|
| y | down|
+----+------+
If the names are same i want to put number 1,2,3 like this, new column must be like this
+----+------+------+
|name|value |newCol|
+----+------+------+
| x|down | 1|
| y|normal| 2|
| z|down | 3|
| x|normal| 1|
| y|down | 2|
+----+------+------+
win = Window.partitionBy("name").orderBy("name")
print("value")
dp_df_classification_agg_join = dp_df_classification_agg_join.withColumn("newCol",count("name").over(win))
First, replace the count("name") function with the dense_rank() function.
Then, replace this win = Window.partitionBy("name").orderBy("name") with win = Window.partitionBy().orderBy("name")

Adding column with rolling latest prior in PySpark

I have a pyspark dataframe with a list of customers, days, and transaction types.
+----------+-----+------+
| Customer | Day | Type |
+----------+-----+------+
| A | 2 | X11 |
| A | 4 | X2 |
| A | 9 | Y4 |
| A | 11 | X1 |
| B | 3 | Y4 |
| B | 7 | X1 |
+----------+-----+------+
I'd like to create a column that has "most recent X type" for each customer, like so:
+----------+-----+------+-------------+
| Customer | Day | Type | MostRecentX |
+----------+-----+------+-------------+
| A | 2 | X11 | X11 |
| A | 4 | X2 | X2 |
| A | 9 | Y4 | X2 |
| A | 11 | X1 | X1 |
| B | 3 | Y4 | - |
| B | 7 | X1 | X1 |
+----------+-----+------+-------------+
So for the X types it just takes the one from the current row, but for the Y type it takes the type from the most recent X row for that member (and if there isn't one, it gets a blank or something). I imagine I need a sort of window function but not very familiar with PySpark.
You can achieve by this taking the last column that startswith the letter "X" over a Window that partitions by the Customer and orders by the Day. Specify the Window to start at the beginning of the partition and stop at the current row.
from pyspark.sql import Window
from pyspark.sql.functions import col, last, when
w = Window.partitionBy("Customer").orderBy("Day").rowsBetween(Window.unboundedPreceding, 0)
df = df.withColumn(
"MostRecentX",
last(when(col("Type").startswith("X"), col("Type")), ignorenulls=True).over(w)
)
df.show()
#+--------+---+----+-----------+
#|Customer|Day|Type|MostRecentX|
#+--------+---+----+-----------+
#| A| 2| X11| X11|
#| A| 4| X2| X2|
#| A| 9| Y4| X2|
#| A| 11| X1| X1|
#| B| 3| Y4| null|
#| B| 7| X1| X1|
#+--------+---+----+-----------+
The trick here is to use when to return the Type column only if it starts with "X". By default, when will return null. Then we can use last with ignorenulls=True to get the value for MostRecentX.
If you want to replace the null with "-" as shown in your question, just call fillna on the MostRecentX column:
df.fillna("-", subset=["MostRecentX"]).show()
#+--------+---+----+-----------+
#|Customer|Day|Type|MostRecentX|
#+--------+---+----+-----------+
#| A| 2| X11| X11|
#| A| 4| X2| X2|
#| A| 9| Y4| X2|
#| A| 11| X1| X1|
#| B| 3| Y4| -|
#| B| 7| X1| X1|
#+--------+---+----+-----------+

Find column names of interconnected row values - Spark

I have a Spark dataframe that adheres to the following structure:
+------+-----------+-----------+-----------+------+
|ID | Name1 | Name2 | Name3 | Y |
+------+-----------+-----------+-----------+------+
| 1 | A,1 | B,1 | C,4 | B |
| 2 | D,2 | E,2 | F,8 | D |
| 3 | G,5 | H,2 | I,3 | H |
+------+-----------+-----------+-----------+------+
For every row I want to find in which column the value of Y is denoted as the first element. So, ideally I want to retrieve a list like: [Name2,Name1,Name2].
I am not sure how and whether it works to convert first to a RDD, then use a map function and convert the result back to DataFrame.
Any ideas are welcome.
You can probably try this piece of code :
df.show()
+---+-----+-----+-----+---+
| ID|Name1|Name2|Name3| Y|
+---+-----+-----+-----+---+
| 1| A,1| B,1| C,4| B|
| 2| D,2| E,2| F,8| D|
| 3| G,5| H,2| I,3| H|
+---+-----+-----+-----+---+
from pyspark.sql import functions as F
name_cols = ["Name1", "Name2", "Name3"]
cond = F
for col in name_cols:
cond = cond.when(F.split(F.col(col),',').getItem(0) == F.col("Y"), col)
df.withColumn("whichName", cond).show()
+---+-----+-----+-----+---+---------+
| ID|Name1|Name2|Name3| Y|whichName|
+---+-----+-----+-----+---+---------+
| 1| A,1| B,1| C,4| B| Name2|
| 2| D,2| E,2| F,8| D| Name1|
| 3| G,5| H,2| I,3| H| Name2|
+---+-----+-----+-----+---+---------+

How to replace row values in pyspark?

I have a column which name is id and data is like this
+----+
| id |
+----+
| 1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 1 |
| 2 |
| 3 |
| 4 |
+----+
I want to replace first 6 row 1,2,3,4,5,6 by emp and second 1,2,3,4 by std: it is possible?
I tried using replace but I didn't get the desired answer
For pyspark you can use something like below;
>>> from pyspark.sql import Row
>>> import pyspark.sql.functions as F
>>>
>>> df = sc.parallelize([1,2,3,4,5,6,1,2,3,4]).map(lambda x: Row(x)).toDF(['col'])
>>> df.show()
+---+
|col|
+---+
| 1|
| 2|
| 3|
| 4|
| 5|
| 6|
| 1|
| 2|
| 3|
| 4|
+---+
>>> from pyspark.sql.window import Window
>>> df = df.withColumn("id", F.row_number().over(Window.orderBy(F.lit('A'))))
>>> df = df.withColumn('col', F.when(df.id < 7, 'emp').when(df.id >= 7, 'std')).select('col')
>>> df.show()
+---+
|col|
+---+
|emp|
|emp|
|emp|
|emp|
|emp|
|emp|
|std|
|std|
|std|
|std|
+---+
This works:
import pandas as pd
df=pd.DataFrame({'id':[1,2,3,4,5,6,7,8,9,10],'data':[1,2,3,4,5,6,1,2,3,4]})
df.loc[:6,'New_COl'] = "emp"
df.loc[6:,'New_COl'] = "std"

Transpose mutiple columns in a Pyspark dataframe with a condition

I have a spark dataframe that looks like this.
id cd1 version1 dt1 cd2 version2 dt2 cd3 version3 dt3
1 100 1 20100101 101 1 20100101 102 20100301
1 101 1 20100102 102 20100201 100 1 20100302
2 201 1 20100103 100 1 20100301 100 1 20100303
2 202 2 20100104 100 1 20100105
I need to transpose all the codes into a single column with the following conditions
If the corresponding version code is 1, add a decimal point after the first digit
Each patient should have distinct codes
For the above example, the output should look like this.
id code dt
1 1.00 20100101
1 1.01 20100101
1 102 20100301
1 1.01 20100102
1 102 20100201
1 10.0 20100302
2 2.01 20100103
2 1.00 20100301
2 1.00 20100303
2 202 20100104
2 10.0 20100105
I am using Pyspark to do this. In the above example, I have shown only 3 codes with their corresponding version columns but I have 30 such columns. Also, this data has around 25 million rows.
Any ideas on how to accomplish this will be extremely helpful.
You can explode a list of these columns so that there is only one (cd, version) pair per line
First, let's create the dataframe:
df = sc.parallelize([[1,100,1,101,1,102,None],[1,101,1,102,None,100,1],[2,201,1,100,1,100,1],
[2,202,2,100,1,None,None]]).toDF(["id","cd1","version1","cd2","version2","cd3","version3"])
Using posexplode:
import pyspark.sql.functions as psf
from itertools import chain
nb_versions = 4
df = df.na.fill(-1).select(
"id",
psf.posexplode(psf.create_map(list(chain(*[(psf.col("cd" + str(i)), psf.col("version"+str(i))) for i in range(1, nb_versions)])))).alias("pos", "cd", "version")
).drop("pos").filter("cd != -1")
+---+---+-------+
| id| cd|version|
+---+---+-------+
| 1|100| 1|
| 1|101| 1|
| 1|102| -1|
| 1|101| 1|
| 1|102| -1|
| 1|100| 1|
| 2|201| 1|
| 2|100| 1|
| 2|100| 1|
| 2|202| 2|
| 2|100| 1|
+---+---+-------+
Using explode:
nb_versions = 4
df = df.select(
"id",
psf.explode(psf.array(
[psf.struct(
psf.col("cd" + str(i)).alias("cd"),
psf.col("version" + str(i)).alias("version")) for i in range(1, nb_versions)])).alias("temp"))\
.select("id", "temp.*")
+---+----+-------+
| id| cd|version|
+---+----+-------+
| 1| 100| 1|
| 1| 101| 1|
| 1| 102| null|
| 1| 101| 1|
| 1| 102| null|
| 1| 100| 1|
| 2| 201| 1|
| 2| 100| 1|
| 2| 100| 1|
| 2| 202| 2|
| 2| 100| 1|
| 2|null| null|
+---+----+-------+
Now we can implement your conditions
division by 100 for version==1
distinct values
We'll use functions when, otherwise for the condition and distinct:
df.withColumn("cd", psf.when(df.version == 1, df.cd/100).otherwise(df.cd))\
.distinct().drop("version")
+---+-----+
| id| cd|
+---+-----+
| 1| 1.0|
| 1| 1.01|
| 1|102.0|
| 2| 1.0|
| 2| 2.01|
| 2|202.0|
+---+-----+
This is how I did it. I am sure there is a better way to do this.
def process_code(raw_data):
for i in range(1,4):
cd_col_name = "cd" + str(i)
version_col_name = "version" + str(i)
raw_data = raw_data.withColumn("mod_cd" + str(i), when(raw_data[version_col_name] == 1, concat(substring(raw_data[cd_col_name],1,1),lit("."),substring(raw_data[cd_col_name],2,20))).otherwise(raw_data[cd_col_name]))
mod_cols = [col for col in raw_data.columns if 'mod_cd' in col]
nb_versions = 3
new = raw_data.fillna('9999', subset=mod_cols).select("id", psf.posexplode(psf.create_map(list(chain(*[(psf.col("mod_cd" + str(i)), psf.col("dt"+str(i))) for i in range(1, nb_versions)])))).alias("pos", "final_cd", "final_date")).drop("pos")
return new
test = process_code(df)
test = test.filter(test.final_cd != '9999')
test.show(100, False)

Categories

Resources