Unexplode in pyspark with sequence conditional - python

I need to "unexplode" a column in a PySpark dataframe, where the rows to merge are determined by a sequential flag column. E.g.:
Input dataframe
Expect output dataframe
When c1 = 1 in a row, the content of that row's c4 column continues in the next row (it was split because its length exceeded the limit). When c1 = 0, c4 holds the complete content (or the last fragment), so nothing continues into the next row. The content of c4 can be split across several consecutive rows.
This is similar to the result of pyspark.sql.functions.explode(col), and I need to reverse it (unexplode), but which rows belong together is determined by the c1 column (so it is not as simple as a group-by followed by collect_list, i.e. df.groupby().agg(F.collect_list()), because c1 is a sequential condition that depends on row order).
I tried to use a window function, following the topic "PySpark - Append previous and next row to current row". But how can I handle the case where the c4 column is split across multiple following rows?
Sample code
from pyspark.sql import SparkSession
spark_session = SparkSession.builder.getOrCreate()
df_in = spark_session.createDataFrame(
[
(1, 'a', 'b', 'c1', 'd'),
(0, 'a', 'b', 'c2', 'd'),
(0, 'e', 'f', 'g', 'h'),
(0, '1', '2', '3', '4'),
(1, 'x', 'y', 'z1', 'k'),
(1, 'x', 'y', 'z2', 'k'),
(1, 'x', 'y', 'z3', 'k'),
(0, 'x', 'y', 'z4', 'k'),
(1, '6', '7', '81', '9'),
(0, '6', '7', '82', '9'),
],
['c1', 'c2', 'c3', 'c4', 'c5']
)
df_out = spark_session.createDataFrame(
[
('a', 'b', 'c1-c2', 'd'),
('e', 'f', 'g', 'h'),
('1', '2', '3', '4'),
('x', 'y', 'z1-z2-z3-z4', 'k'),
('6', '7', '81-82', '9')
],
['c2', 'c3', 'c4', 'c5']
)
df_in.show()
df_out.show()
How can I solve this? Thank you.
UPDATED
input
df_in = spark_session.createDataFrame(
[
('0', 1, 'a', 'b', 'c1', 'd'),
('0', 0, 'a', 'b', 'c2', 'd'),
('0', 0, 'e', 'f', 'g', 'h'),
('0', 0, '1', '2', '3', '4'),
('0', 1, 'x', 'y', 'sele', 'k'),
('0', 1, 'x', 'y', 'ct ', 'k'),
('0', 1, 'x', 'y', 'from', 'k'),
('0', 0, 'x', 'y', 'a', 'k'),
('0', 1, '6', '7', '81', '9'),
('0', 0, '6', '7', '82', '9'),
],
['c0', 'c1', 'c2', 'c3', 'c4', 'c5']
)
output
Expect output
x| y|select -from-a| k

This solution works even when your data set is in multiple partitions and not ordered.
from pyspark.sql.window import Window
from pyspark.sql import functions as F
# Sort key for the fragments of one record: the c4 text first, then the c1
# continuation flag (1 before 0).
# NOTE(review): this relies on the lexicographic order of c4 matching the
# intended fragment order (true for the first sample input) - confirm for
# real data, which may need an explicit ordering column.
orderByColumns = [F.col('c4'), F.col('c1').cast('int').desc()]
# Columns that identify one logical record; all fragments of a record share
# the same values in these columns.
partitionColumns = [F.col(column) for column in ['c2', 'c3', 'c5']]
# Ranks each fragment within its record according to the sort key above.
rankWindow = Window.partitionBy(partitionColumns).orderBy(orderByColumns)
df_in.orderBy(orderByColumns)\
    .withColumn('ranked', F.dense_rank().over(rankWindow))\
    .withColumn('c4-ranked', F.concat(F.col('ranked'), F.lit('='), F.col('c4')))\
    .groupBy(partitionColumns)\
    .agg(F.collect_list('c4-ranked').alias('c4'))\
    .select(
        F.col('c2'),
        F.col('c3'),
        # Join the fragments with "-" and strip the "<rank>=" prefixes again.
        # Raw string: "\d" in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        F.regexp_replace(F.array_join(F.col('c4'), "-"), r"\d+=", "").alias('c4'),
        F.col('c5')
    )\
    .show()
+---+---+-----------+---+
| c2| c3| c4| c5|
+---+---+-----------+---+
| 1| 2| 3| 4|
| x| y|z1-z2-z3-z4| k|
| e| f| g| h|
| 6| 7| 81-82| 9|
| a| b| c1-c2| d|
+---+---+-----------+---+
Setup
# Same sample data as in the question, built with the `spark_session` object
# created there, then shuffled into 5 partitions so that the row order is no
# longer the insertion order.
df_in = spark_session.createDataFrame(
    [
        (1, 'a', 'b', 'c1', 'd'),
        (0, 'a', 'b', 'c2', 'd'),
        (0, 'e', 'f', 'g', 'h'),
        (0, '1', '2', '3', '4'),
        (1, 'x', 'y', 'z1', 'k'),
        (1, 'x', 'y', 'z2', 'k'),
        (1, 'x', 'y', 'z3', 'k'),
        (0, 'x', 'y', 'z4', 'k'),
        (1, '6', '7', '81', '9'),
        (0, '6', '7', '82', '9'),
    ],
    ['c1', 'c2', 'c3', 'c4', 'c5']
).repartition(5)
df_in.show()
This produces the following on my run (the row order may vary with each run):
+---+---+---+---+---+
| c1| c2| c3| c4| c5|
+---+---+---+---+---+
| 1| x| y| z2| k|
| 0| x| y| z4| k|
| 1| a| b| c1| d|
| 0| 1| 2| 3| 4|
| 0| 6| 7| 82| 9|
| 0| a| b| c2| d|
| 0| e| f| g| h|
| 1| 6| 7| 81| 9|
| 1| x| y| z3| k|
| 1| x| y| z1| k|
+---+---+---+---+---+

Related

How to map a column in PySpark DataFrame and avoid getting Null values?

I have a PySpark DataFrame and I want to map values of a column.
Sample dataset:
data = [(1, 'N'), \
(2, 'N'), \
(3, 'C'), \
(4, 'S'), \
(5, 'North'), \
(6, 'Central'), \
(7, 'Central'), \
(8, 'South')
]
columns = ["ID", "City"]
df = spark.createDataFrame(data = data, schema = columns)
The mapping dictionary is:
{'N': 'North', 'C': 'Central', 'S': 'South'}
And I use the following code:
from pyspark.sql import functions as F
from itertools import chain
mapping_dict = {'N': 'North', 'C': 'Central', 'S': 'South'}
mapping_expr = F.create_map([F.lit(x) for x in chain(*mapping_dict.items())])
df_new = df.withColumn('City_New', mapping_expr[df['City']])
And the results are:
As you can see, I get null values for the rows whose values are not included in the mapping dictionary. To solve this, I can define the mapping dictionary as:
{'N': 'North', 'C': 'Central', 'S': 'South', \
'North': 'North', 'Central': 'Central', 'South': 'South'}
However, if there are many unique values in the dataset, it is hard to define a mapping dictionary.
Is there any better way for this purpose?
you can use a coalesce.
here's how it'd look like.
# NOTE(review): `func`, `map_dict` and `data_sdf` are not defined in this
# snippet - presumably `func` is pyspark.sql.functions, `map_dict` the mapping
# dictionary and `data_sdf` the input DataFrame (`F`, `mapping_dict` and `df`
# in the question); confirm before running.
# Create a separate CASE WHEN for each key-value pair. The comparison is done
# on upper-cased values; each expression has no ELSE branch (see the
# generated columns below).
map_whens = [func.when(func.upper('city') == k.upper(), v) for k, v in map_dict.items()]
# [Column<'CASE WHEN (upper(city) = N) THEN North END'>,
# Column<'CASE WHEN (upper(city) = C) THEN Central END'>,
# Column<'CASE WHEN (upper(city) = S) THEN South END'>]
# Pass the CASE WHENs to coalesce with the `city` field as the last argument,
# so that values without a mapping keep their original value (see output).
data_sdf. \
withColumn('city_new', func.coalesce(*map_whens, 'city')). \
show()
# +---+-------+--------+
# | id| city|city_new|
# +---+-------+--------+
# | 1| N| North|
# | 2| N| North|
# | 3| C| Central|
# | 4| S| South|
# | 5| North| North|
# | 6|Central| Central|
# | 7|Central| Central|
# | 8| South| South|
# +---+-------+--------+

Compare two couple of columns from two different pyspark dataframe to display the data that are different

i've got this dataframe with four columns
df1 = spark.createDataFrame([
('c', 'd', 3.0, 4),
('c', 'd', 7.3, 8),
('c', 'd', 7.3, 2),
('c', 'd', 7.3, 8),
('e', 'f', 6.0, 3),
('e', 'f', 6.0, 8),
('e', 'f', 6.0, 3),
('c', 'j', 4.2, 3),
('c', 'j', 4.3, 9),
], ['a', 'b', 'c', 'd'])
df1.show()
+---+---+---+---+
| a| b| c| d|
+---+---+---+---+
| c| d|3.0| 4|
| c| d|7.3| 8|
| c| d|7.3| 2|
| c| d|7.3| 8|
| e| f|6.0| 3|
| e| f|6.0| 8|
| e| f|6.0| 3|
| c| j|4.2| 3|
| c| j|4.3| 9|
+---+---+---+---+
and i also got this other dataframe df2 with the same schema as the dataframe df1
df2 = spark.createDataFrame([
('c', 'd', 3.0, 4),
('c', 'd', 3.3, 5),
('c', 'd', 7.3, 2),
('c', 'd', 7.3, 7),
('e', 'f', 6.0, 3),
('c', 'j', 4.2, 1),
('c', 'j', 4.3, 9),
], ['a', 'b', 'c', 'd'])
df2.show()
+---+---+---+---+
| a| b| c| d|
+---+---+---+---+
| c| d|3.0| 4|
| c| d|3.3| 5|
| c| d|7.3| 2|
| c| d|7.3| 7|
| e| f|6.0| 3|
| c| j|4.2| 1|
| c| j|4.3| 9|
+---+---+---+---+
I want to compare the combinations (a, b, d) so that I can obtain the rows that are present in df2 but not in df1, like this:
df3
+---+---+---+---+
| a| b| c| d|
+---+---+---+---+
| c| d|3.3| 5|
| c| d|7.3| 7|
| c| j|4.2| 1|
+---+---+---+---+
I think what you want is:
df2.subtract(df1.intersect(df2)).show()
I want what is in df2 that is not in both df1 and df2.
+---+---+---+---+
| a| b| c| d|
+---+---+---+---+
| c| j|4.2| 1|
| c| d|3.3| 5|
| c| d|7.3| 7|
+---+---+---+---+
I also agree with @pltc, who called out that you might have made a mistake in your output table.

Compute proportion of values within groups

I'm trying to calculate the proportion of a specific value occurring in a specific column within subgroups.
Sample dataframe
pdf = pd.DataFrame({
'id': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],
'letter': ['L', 'A', 'L', 'L', 'L', 'L', 'L', 'A', 'L', 'L']
})
df = spark.createDataFrame(pdf)
df.show()
I tried to rely on this answer but with the following code
df\
.groupby('id')\
.agg((count(col('letter') == 'L') / count(col('letter'))).alias('prop'))\
.show()
I obtained a column full of 1.0, even when I changed 'L' to 'A'.
My desired output is, for each group, the proportion of 'L' values within the group:
+---+--------+
| id| prop|
+---+--------+
| 1| 0.75|
| 2| 1.0|
| 3| 0.66667|
+---+--------+
You can use sum with when instead to count the occurrences of L:
# Score each 'L' row as 1 and every other row as 0 before summing. Without
# the otherwise(0), a group that contains no 'L' at all would sum only nulls
# and produce a null proportion instead of 0.0.
df.groupby('id')\
    .agg((F.sum(F.when(F.col('letter') == 'L', 1).otherwise(0)) / F.count(F.col('letter'))).alias('prop'))\
    .show()
This gives the proportion among the non-null values only. If you want to calculate it over all rows, divide by count("*") instead of count(col('letter')).
Before you count, you need to mask the non-L letters with nulls using when:
df\
.groupby('id')\
.agg((count(when(col('letter') == 'L', 1)) / count(col('letter'))).alias('prop'))\
.show()
Note that count will only count non-null entries. It does not only count true entries, as you had expected in your code. Your code is more suitable if you're using count_if from Spark SQL.

Generate all string combinations from given character list in python

I would like to generate all possible combinations of a given character list with a given length and exclude a few combinations. For example, if I have this list:
chars = ['a', 'b', 'c', '1', '2']
Now I want to exclude sequences with more than 2 identical characters in a row, so that combinations like aaaaa or 11111 aren't possible. I also want the output to have a given length, for example 5 characters. Is this possible? I thought of using itertools.
Thanks for any help in advance.
import itertools

chars = ['a', 'b', 'c', '1', '2']


def limited_combinations(alphabet, length, max_count=2):
    """Yield tuples of ``length`` items from ``alphabet`` with limited repeats.

    Tuples are produced in ``itertools.product`` order. A tuple is kept only
    when no item occurs more than ``max_count`` times in it.

    Note that the limit applies to the total number of occurrences, not only
    to consecutive ones, so this is stricter than "no more than ``max_count``
    identical characters in a row".
    """
    for combination in itertools.product(alphabet, repeat=length):
        # Checking each distinct item once is enough to enforce the limit.
        if all(combination.count(item) <= max_count for item in set(combination)):
            yield combination


for combination in limited_combinations(chars, 5):
    print(combination)
Output:
('c', '1', '1', '2', 'c')
('c', '1', '1', '2', '2')
('c', '1', '2', 'a', 'a')
('c', '1', '2', 'a', 'b')
('c', '1', '2', 'a', 'c')
('c', '1', '2', 'a', '1')
('c', '1', '2', 'a', '2')
('c', '1', '2', 'b', 'a')
('c', '1', '2', 'b', 'b')
('c', '1', '2', 'b', 'c')
('c', '1', '2', 'b', '1')
('c', '1', '2', 'b', '2')
('c', '1', '2', 'c', 'a')
('c', '1', '2', 'c', 'b')
('c', '1', '2', 'c', '1')
('c', '1', '2', 'c', '2')
('c', '1', '2', '1', 'a')
('c', '1', '2', '1', 'b')
('c', '1', '2', '1', 'c')
('c', '1', '2', '1', '2')
('c', '1', '2', '2', 'a')
('c', '1', '2', '2', 'b')
('c', '1', '2', '2', 'c')
('c', '1', '2', '2', '1')
('c', '2', 'a', 'a', 'b')
('c', '2', 'a', 'a', 'c')
('c', '2', 'a', 'a', '1')
('c', '2', 'a', 'a', '2')
('c', '2', 'a', 'b', 'a')
('c', '2', 'a', 'b', 'b')
('c', '2', 'a', 'b', 'c')
('c', '2', 'a', 'b', '1')
('c', '2', 'a', 'b', '2')
('c', '2', 'a', 'c', 'a')
('c', '2', 'a', 'c', 'b')
('c', '2', 'a', 'c', '1')
('c', '2', 'a', 'c', '2')
('c', '2', 'a', '1', 'a')
('c', '2', 'a', '1', 'b')
('c', '2', 'a', '1', 'c')
('c', '2', 'a', '1', '1')
('c', '2', 'a', '1', '2')
('c', '2', 'a', '2', 'a')
('c', '2', 'a', '2', 'b')
('c', '2', 'a', '2', 'c')
('c', '2', 'a', '2', '1')
('c', '2', 'b', 'a', 'a')
('c', '2', 'b', 'a', 'b')
('c', '2', 'b', 'a', 'c')
('c', '2', 'b', 'a', '1')
('c', '2', 'b', 'a', '2')
('c', '2', 'b', 'b', 'a')
('c', '2', 'b', 'b', 'c')
('c', '2', 'b', 'b', '1')
('c', '2', 'b', 'b', '2')
('c', '2', 'b', 'c', 'a')
('c', '2', 'b', 'c', 'b')
('c', '2', 'b', 'c', '1')
('c', '2', 'b', 'c', '2')
('c', '2', 'b', '1', 'a')
('c', '2', 'b', '1', 'b')
('c', '2', 'b', '1', 'c')
('c', '2', 'b', '1', '1')
('c', '2', 'b', '1', '2')
('c', '2', 'b', '2', 'a')
('c', '2', 'b', '2', 'b')
('c', '2', 'b', '2', 'c')
('c', '2', 'b', '2', '1')
('c', '2', 'c', 'a', 'a')
('c', '2', 'c', 'a', 'b')
('c', '2', 'c', 'a', '1')
('c', '2', 'c', 'a', '2')
('c', '2', 'c', 'b', 'a')
('c', '2', 'c', 'b', 'b')
('c', '2', 'c', 'b', '1')
('c', '2', 'c', 'b', '2')
etc...

Removing 0 at the end of multiple time series

I have multiple time series stored in a Spark DataFrame as below:
df = spark.createDataFrame([('2020-03-10', 'France', 19),
('2020-03-11', 'France', 22),
('2020-03-12', 'France', 0),
('2020-03-13', 'France', 0),
('2020-03-14', 'France', 0),
('2020-04-10', 'UK', 12),
('2020-04-11', 'UK', 0),
('2020-04-12', 'UK', 9),
('2020-04-13', 'UK', 0),
('2020-04-08', 'Japan', 0),
('2020-04-09', 'Japan', -3),
('2020-04-10', 'Japan', -2)
],
['date', 'country', 'y']
)
I am looking for a way (without looping as my real DataFrame has millions of rows) to remove the 0's at the end of each time series.
In our example, we would obtain:
df = spark.createDataFrame([('2020-03-10', 'France', 19),
('2020-03-11', 'France', 22),
('2020-04-10', 'UK', 12),
('2020-04-11', 'UK', 0),
('2020-04-12', 'UK', 9),
('2020-04-08', 'Japan', 0),
('2020-04-09', 'Japan', -3),
('2020-04-10', 'Japan', -2)
],
['date', 'country', 'y']
)
Assuming you want to remove the trailing zeros of each country's series, ordered by date:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import Window
df = spark.createDataFrame([('2020-03-10', 'France', 19),
('2020-03-11', 'France', 22),
('2020-03-12', 'France', 0),
('2020-03-13', 'France', 0),
('2020-03-14', 'France', 0),
('2020-04-10', 'UK', 12),
('2020-04-11', 'UK', 0),
('2020-04-12', 'UK', 9),
('2020-04-13', 'UK', 0),
('2020-04-13', 'India', 1),
('2020-04-14', 'India', 0),
('2020-04-15', 'India', 0),
('2020-04-16', 'India', 1),
('2020-04-08', 'Japan', 0),
('2020-04-09', 'Japan', -3),
('2020-04-10', 'Japan', -2)
],
['date', 'country', 'y']
)
# Work on absolute values so that positive and negative entries cannot
# cancel each other out and accidentally sum to 0.
df=df.withColumn('y1',F.abs(F.col('y')))
# Window per country, ordered by date descending, so each series is walked
# from its last row back to its first.
w=Window.partitionBy('country').orderBy(F.col('date').desc())
# Running sum of |y| over that window: it stays 0 across the trailing zeros
# and becomes non-zero from the last non-zero value onwards.
df_sum = df.withColumn("sum_chk",F.sum('y1').over(w))
# Keep only rows whose running sum is non-zero (this drops the trailing
# zeros); the ascending sort by date is just for viewing.
df_res = df_sum.where("sum_chk!=0").orderBy('date',ascending=True)
The result:
df_res.show()
+----------+-------+---+---+-------+
| date|country| y| y1|sum_chk|
+----------+-------+---+---+-------+
|2020-03-10| France| 19| 19| 41|
|2020-03-11| France| 22| 22| 22|
|2020-04-08| Japan| 0| 0| 5|
|2020-04-09| Japan| -3| 3| 5|
|2020-04-10| Japan| -2| 2| 2|
|2020-04-10| UK| 12| 12| 21|
|2020-04-11| UK| 0| 0| 9|
|2020-04-12| UK| 9| 9| 9|
|2020-04-13| India| 1| 1| 2|
|2020-04-14| India| 0| 0| 1|
|2020-04-15| India| 0| 0| 1|
|2020-04-16| India| 1| 1| 1|
+----------+-------+---+---+-------+

Categories

Resources