Use pyspark countDistinct by another column with already grouped dataframe - python

I have a pyspark dataframe that looks like this:
key  key2  category  ip_address
1    a     desktop   111
1    a     desktop   222
1    b     desktop   333
1    c     mobile    444
2    d     cell      555
And I want to groupBy key to get the total number of unique ip_address values, the total number of unique key2 values, and the number of unique ip_address values contributed by each category (assume the set of category values is fixed, so category can only be one of [desktop, mobile, cell]).
So, I'm looking for a resulting dataframe like this:
key  num_ips  num_key2  num_desktop  num_mobile  num_cell
1    4        3         3            1           0
2    1        1         0            0           1
I've been trying code like this, but the part that computes num_desktop, num_mobile, and num_cell isn't quite right.
import pyspark.sql.functions as F

df_agg = df.groupBy('key') \
    .agg(F.countDistinct('ip_address').alias('num_ips'),
         F.countDistinct('key2').alias('num_key2'),
         # these three fail: countDistinct() returns a Column, which has no .where()
         F.countDistinct('ip_address').where(F.col('category') == 'desktop').alias('num_desktop'),
         F.countDistinct('ip_address').where(F.col('category') == 'mobile').alias('num_mobile'),
         F.countDistinct('ip_address').where(F.col('category') == 'cell').alias('num_cell'))
Do I have to do some type of nested groupBy, or maybe a Window function? Any help is greatly appreciated!

I had to split the dataframe by category and join the pieces back for the desktop, mobile, and cell counts:
import pyspark.sql.functions as F
from pyspark.sql.functions import col

df1 = df.groupBy('key') \
    .agg(F.countDistinct('ip_address').alias('num_ips'),
         F.countDistinct('key2').alias('num_key2'))

de = df.filter(col("category") == "desktop").groupBy('key') \
    .agg(F.countDistinct('ip_address').alias('num_desktop')).withColumnRenamed("key", "key1")
dm = df.filter(col("category") == "mobile").groupBy('key') \
    .agg(F.countDistinct('ip_address').alias('num_mobile')).withColumnRenamed("key", "key1")
dc = df.filter(col("category") == "cell").groupBy('key') \
    .agg(F.countDistinct('ip_address').alias('num_cell')).withColumnRenamed("key", "key1")

join_df = df1.join(de, df1.key == de.key1, "left").drop("key1") \
    .join(dm, df1.key == dm.key1, "left").drop("key1") \
    .join(dc, df1.key == dc.key1, "left").drop("key1") \
    .fillna(0)
Output:
+---+-------+--------+-----------+----------+--------+
|key|num_ips|num_key2|num_desktop|num_mobile|num_cell|
+---+-------+--------+-----------+----------+--------+
| 1| 4| 3| 3| 1| 0|
| 2| 1| 1| 0| 0| 1|
+---+-------+--------+-----------+----------+--------+
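A single aggregation also works here and avoids the splits and joins. This is only a sketch, assuming the column names from the question (key, key2, category, ip_address): countDistinct ignores nulls, so wrapping ip_address in a when() expression restricts each count to one category.

import pyspark.sql.functions as F

df_agg = df.groupBy('key').agg(
    F.countDistinct('ip_address').alias('num_ips'),
    F.countDistinct('key2').alias('num_key2'),
    # when() yields null for non-matching rows, and countDistinct skips nulls
    F.countDistinct(F.when(F.col('category') == 'desktop', F.col('ip_address'))).alias('num_desktop'),
    F.countDistinct(F.when(F.col('category') == 'mobile', F.col('ip_address'))).alias('num_mobile'),
    F.countDistinct(F.when(F.col('category') == 'cell', F.col('ip_address'))).alias('num_cell'),
)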

Related

Pyspark DataFrame Grouping by item that doesn't belong to the group

I am new to pyspark and am stuck. Could you please help me obtain a result from data that looks like this:
customer_id  item_id  amount
1            tx1      15
1            tx2      10
1            tx3      14
2            tx1      15
2            tx4      12
3            tx2      10
2            tx6      43
4            tx4      12
5            tx8      76
6            tx6      43
5            tx6      43
3            tx6      43
And I want to know, for each item:
The count of customers that didn't purchase this item
The sum of the amounts spent by the customers who didn't purchase this item
So the final table would look like:
item_id  target_cust  target_amount
tx1      4            227
tx2      4            201
tx3      5            297
tx4      4            --
tx6      3            --
tx8      5            --
Please help me get a similar output; any suggestions in this direction would be great.
First, group by customer_id and collect the set of purchased item_ids along with the summed amount, like this:
import pyspark.sql.functions as F

items_by_customer_df = df.groupBy("customer_id").agg(
    F.collect_set("item_id").alias("items"),
    F.sum("amount").alias("target_amount")
)
items_by_customer_df.show()
#+-----------+---------------+-------------+
#|customer_id|items |target_amount|
#+-----------+---------------+-------------+
#|1 |[tx1, tx2, tx3]|39 |
#|2 |[tx1, tx6, tx4]|70 |
#|3 |[tx2, tx6] |53 |
#|5 |[tx6, tx8] |119 |
#|4 |[tx4] |12 |
#|6 |[tx6] |43 |
#+-----------+---------------+-------------+
Now, join this grouped dataframe with the distinct item_ids from the original df, using the negation of array_contains as the join condition, then group by item_id and aggregate with count(customer_id) and sum(target_amount):
result = df.select("item_id").distinct().join(
    items_by_customer_df,
    ~F.array_contains("items", F.col("item_id"))
).groupBy("item_id").agg(
    F.count("customer_id").alias("target_cust"),
    F.sum("target_amount").alias("target_amount")
)
result.show()
#+-------+-----------+-------------+
#|item_id|target_cust|target_amount|
#+-------+-----------+-------------+
#| tx2| 4| 244|
#| tx4| 4| 254|
#| tx1| 4| 227|
#| tx8| 5| 217|
#| tx3| 5| 297|
#| tx6| 2| 51|
#+-------+-----------+-------------+
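Another way to get the same numbers is to compute grand totals once and subtract each item's buyers from them, which avoids the non-equi join. This is only a sketch on the same sample data; cust_amount, n_buyers, and buyers_amount are illustrative names, not from the original answer.

import pyspark.sql.functions as F

# Total spend per customer.
cust_totals = df.groupBy("customer_id").agg(F.sum("amount").alias("cust_amount"))

# Grand totals across all customers.
total_customers = cust_totals.count()
total_amount = cust_totals.agg(F.sum("cust_amount").alias("t")).first()["t"]

# Per item: totals over the customers who DID buy it, then subtract from the grand totals.
buyers = (
    df.select("customer_id", "item_id").distinct()  # guard against repeat purchases
      .join(cust_totals, "customer_id")
      .groupBy("item_id")
      .agg(F.countDistinct("customer_id").alias("n_buyers"),
           F.sum("cust_amount").alias("buyers_amount"))
)

result = buyers.select(
    "item_id",
    (F.lit(total_customers) - F.col("n_buyers")).alias("target_cust"),
    (F.lit(total_amount) - F.col("buyers_amount")).alias("target_amount"),
)
result.show()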

PySpark - Add incrementing integer rank value based on descending order from another column value

I got a pyspark dataframe that looks like:
id  score
1   0.5
1   2.5
2   4.45
3   8.5
3   3.25
3   5.55
And I want to create a new column rank based on the score column, where the highest score within each id gets rank 0 and the rank increments from there, with the count restarting for each id.
Something like this:
id  score  rank
1   2.5    0
1   0.5    1
2   4.45   0
3   8.5    0
3   5.55   1
3   3.25   2
Thanks in advance!
You can use pyspark.sql.functions.dense_rank which returns the rank of rows within a window partition.
Note that for this to work we have to add an orderBy, since dense_rank() requires the window to be ordered. Finally, subtract 1 from the result (as ranking starts from 1 by default):
from pyspark.sql import Window
from pyspark.sql.functions import dense_rank, desc

df = df.withColumn(
    "rank", dense_rank().over(Window.partitionBy("id").orderBy(desc("score"))) - 1)
>>> df.show()
+---+-----+----+
| id|score|rank|
+---+-----+----+
| 1| 2.5| 0|
| 1| 0.5| 1|
| 2| 4.45| 0|
| 3| 8.5| 0|
| 3| 5.55| 1|
| 3| 3.25| 2|
+---+-----+----+
SQL syntax:
SELECT id, score, dense_rank() OVER (PARTITION BY id ORDER BY score DESC) - 1 AS rank FROM table
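If scores can tie and you still want a strictly incrementing 0, 1, 2, ... within each id, row_number() is a drop-in alternative (a sketch, not part of the original answer; dense_rank() would assign the same rank to tied scores):

from pyspark.sql import Window
from pyspark.sql.functions import row_number, desc

w = Window.partitionBy("id").orderBy(desc("score"))
df = df.withColumn("rank", row_number().over(w) - 1)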

Get distinct row count in pyspark

I have the below data frame in pyspark. For every row I want to check whether its id value is unique in the data frame.
Below is the dataframe:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

data = [["1", "2020-02-01"], ["2", "2019-03-01"], ["3", "2021-03-01"], ["4", ""],
        ["5", "2021-21-01"], ["6", "1900-01-01"], ["6", "2000-01-01"]]
df = spark.createDataFrame(data, ["id", "input"])
df.show()
id  input
1   2020-02-01
2   2019-03-01
3   2021-03-01
4
5   2021-21-01
6   1900-01-01
6   2000-01-01
I am looking to get the count of rows per id so I can tell whether each row is unique or not. Below is the output I am looking for.
id  CountUnique
1   1
2   1
3   1
4   1
5   1
6   2
6   2
The below code gives me the count by grouping; however, I need to show the count for every row. For example, 6 should show up twice, each with a row count of 2.
df.groupBy("id").count().orderBy("id").show()
You can count with window functions, i.e. count(*) over (partition by id):
df.withColumn('count', F.expr('count(*) over (partition by id)')).show()
+---+----------+-----+
| id| input|count|
+---+----------+-----+
| 3|2021-03-01| 1|
| 5|2021-21-01| 1|
| 6|1900-01-01| 2|
| 6|2000-01-01| 2|
| 1|2020-02-01| 1|
| 4| | 1|
| 2|2019-03-01| 1|
+---+----------+-----+
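For reference, the same thing with the DataFrame Window API instead of a SQL expression (a sketch equivalent to the answer above):

from pyspark.sql import Window
import pyspark.sql.functions as F

df.withColumn("CountUnique", F.count("*").over(Window.partitionBy("id"))) \
  .select("id", "CountUnique") \
  .orderBy("id") \
  .show()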

New pandas boolean column based on conditions for each unique value

I have a dataframe with the following columns: ID, event_name, event_date
Goal: For every unique ID, if they have a row with event_name == 'attended_book_event', then I want to create a new column attended_book_event with the value 1. If they do not have a row with event_name == 'attended_book_event', then the value in the new column is 0.
Sample:
ID| event_name | event_date
1| joined_club| 12-12-03
1| attended_book_event| 12-27-03
1| elite_member| 03-01-05
2| joined_club| 12-12-03
2| elite_member| 03-01-05
I tried to groupby the id and then create a new column with the condition but the results were not what I was looking for.
df_dose['had_dose_increase'] = [1 if df_dose['event_name'] == 'dose_increased' else 0]
I want a new column
ID| event_name          | event_date| attended_book_event
1 | joined_club         | 12-12-03  | 1
1 | attended_book_event | 12-27-03  | 1
1 | elite_member        | 03-01-05  | 1
2 | joined_club         | 12-12-03  | 0
2 | elite_member        | 03-01-05  | 0
Using pd.Series.groupby with transform:
df['attended_book_event'] = df.groupby('ID')['event_name'].transform(lambda x: 'attended_book_event' in set(x)).astype(int)
Output:
ID event_name event_date attended_book_event
0 1 joined_club 12-12-03 1
1 1 attended_book_event 12-27-03 1
2 1 elite_member 03-01-05 1
3 2 joined_club 12-12-03 0
4 2 elite_member 03-01-05 0
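An alternative sketch without transform, using isin against the IDs that have the event (not part of the original answer; it assumes df is the dataframe from the question):

# IDs that have at least one 'attended_book_event' row
event_ids = df.loc[df['event_name'] == 'attended_book_event', 'ID'].unique()
df['attended_book_event'] = df['ID'].isin(event_ids).astype(int)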

Creating each row for each item for a user in spark dataframe

I have a spark data frame like below:
User Item Purchased
1 A 1
1 B 2
2 A 3
2 C 4
3 A 3
3 B 2
3 D 6
Each user has a row for each item they have purchased. Assume Purchased is the quantity purchased (a count).
However, there are items a user might not have purchased, so for those items that particular user doesn't have a row; we only have rows for items a user has purchased. So if user 1 has purchased items A and B, we have 2 rows for user 1 corresponding to those two items. But if user 2 has purchased A and C, then user 2 has rows for items A and C but not B. In the end, each user should have a row for every item in the table with the corresponding count.
I want to convert this data frame into one that also has rows for items a user has not purchased, with the corresponding count set to zero.
Like below:
User Item Purchased
1 A 1
1 B 2
1 C 0
1 D 0
2 A 3
2 C 4
2 B 0
2 D 0
3 A 3
3 B 2
3 D 6
3 C 0
One way I thought of was to use Spark's crosstab on the first data frame, which converts each item into a column with the corresponding values; for an item a user doesn't have, it creates the column and puts zero there.
But then how do I convert those columns back to rows? It also seems like a roundabout way.
Thanks
We can achieve this using only DataFrame functions as well.
orders = [(1, "A", 1), (1, "B", 2), (2, "A", 3), (2, "C", 4), (3, "A", 3), (3, "B", 2), (3, "D", 6)]
df = sqlContext.createDataFrame(orders, ["user", "item", "purchased"])
df_items = df.select("item").distinct().repartition(5).withColumnRenamed("item", "item_1")
df_users = df.select("user").distinct().repartition(5).withColumnRenamed("user", "user_1")
# crossJoin returns the cartesian product of the users and items dataframes
df_cartesian = df_users.crossJoin(df_items)
joined_df = df_cartesian.join(df, [df_cartesian.user_1 == df.user, df_cartesian.item_1 == df.item],
                              "left_outer").drop("user").drop("item")
result_df = joined_df.fillna(0, ["purchased"]).withColumnRenamed("item_1", "item") \
                     .withColumnRenamed("user_1", "user")
Finally, result_df.show() produces the desired output shown below:
+----+----+---------+
|user|item|purchased|
+----+----+---------+
| 2| A| 3|
| 2| B| 0|
| 2| C| 4|
| 2| D| 0|
| 3| A| 3|
| 3| B| 2|
| 3| C| 0|
| 3| D| 6|
| 1| A| 1|
| 1| B| 2|
| 1| C| 0|
| 1| D| 0|
+----+----+---------+
df = sqlContext.createDataFrame([(1, 'A', 2), (1, 'B', 3), (2, 'A', 2)], ['user', 'item', 'purchased'])
# pivot items into columns, filling missing user/item combinations with 0
pivot = df.groupBy('user').pivot('item').sum('purchased').fillna(0)
# melt the pivoted columns back into (user, item, purchased) rows via the RDD API
items = [i['item'] for i in df.select('item').distinct().collect()]
flattened_rdd = pivot.rdd.flatMap(lambda x: [(x['user'], i, x[i]) for i in items])
sqlContext.createDataFrame(flattened_rdd, ["user", "item", "purchased"]).show()
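To stay in the DataFrame API instead of dropping to the RDD, the pivoted columns can also be melted back with a stack() expression built from whatever item columns the pivot produced. A sketch along the same lines, not part of either answer:

import pyspark.sql.functions as F

pivot = df.groupBy('user').pivot('item').sum('purchased').fillna(0)
# every column except 'user' is an item column created by the pivot
items = [c for c in pivot.columns if c != 'user']
stack_expr = "stack({n}, {pairs}) as (item, purchased)".format(
    n=len(items),
    pairs=", ".join("'{0}', `{0}`".format(c) for c in items),
)
pivot.select('user', F.expr(stack_expr)).show()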
