How to generate date values into a dataframe in PySpark? - python

May I know what's wrong with the code below? It does not print anything.
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, current_timestamp,last_day,next_day, date_format, date_add, year, month, dayofmonth, dayofyear, dayofweek, date_trunc, date_sub, to_date, add_months, weekofyear, quarter, col
from pyspark.sql.types import StructType,StructField,StringType, IntegerType
ss = SparkSession.builder.appName('DateDim').master('local[1]').getOrCreate()
df = ss.createDataFrame([],StructType([]))
current_date()
df = df.select(current_date().alias("current_date"),next_day(current_date(), 'sunday').alias("next_day"),dayofweek(current_date()).alias("day_of_week"),dayofmonth(current_date()).alias("day_of_month"),dayofyear(current_date()).alias("day_of_year"),last_day(current_date()).alias("last_day"),year(current_date()).alias("year"),month(current_date()).alias("month"), weekofyear(current_date()).alias("week_of_year"),quarter(current_date()).alias("quarter")).collect()
print(df)
for i in range(1, 1000):
    print(i)

for i in range(1, 1000):
    v_date = date_add(v_date, i)
    df.unionAll(df.select(v_date.alias("current_date"),next_day(v_date,'sunday').alias("next_day"),dayofweek(v_date).alias("day_of_week"),dayofmonth(v_date).alias("day_of_month"),dayofyear(v_date).alias("day_of_year"),last_day(v_date).alias("last_day"),year(v_date).alias("year"),month(v_date).alias("month"), weekofyear(v_date).alias("week_of_year"),quarter(v_date).alias("quarter")))
df.show()

You're getting zero rows because there are no rows in the initial df, so any columns you create have no values to fill.
It seems you're trying to create a dataframe with 1000 dates starting from the current day. There's a simple approach using the sequence function.
from pyspark.sql import functions as func

data_sdf = spark.createDataFrame([(1,)], 'id int')

data_sdf. \
    withColumn('min_dt', func.current_date().cast('date')). \
    withColumn('max_dt', func.date_add('min_dt', 1000).cast('date')). \
    withColumn('all_dates', func.expr('sequence(min_dt, max_dt, interval 1 day)')). \
    withColumn('dates_exp', func.explode('all_dates')). \
    drop('id'). \
    show(10)
# +----------+----------+--------------------+----------+
# | min_dt| max_dt| all_dates| dates_exp|
# +----------+----------+--------------------+----------+
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-08-27|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-08-28|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-08-29|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-08-30|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-08-31|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-09-01|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-09-02|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-09-03|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-09-04|
# |2022-08-27|2025-05-23|[2022-08-27, 2022...|2022-09-05|
# +----------+----------+--------------------+----------+
# only showing top 10 rows
Select the dates_exp field for further use.
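For example, a minimal sketch (reusing data_sdf and func from the snippet above) that keeps only the generated dates:
date_dim_sdf = data_sdf. \
    withColumn('min_dt', func.current_date()). \
    withColumn('max_dt', func.date_add('min_dt', 1000)). \
    withColumn('all_dates', func.expr('sequence(min_dt, max_dt, interval 1 day)')). \
    withColumn('dates_exp', func.explode('all_dates')). \
    select(func.col('dates_exp').alias('date'))  # keep only the exploded dates

date_dim_sdf.show(5)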

You want to use the range() function, which generates rows (with sequence you generate an array that you then need to explode into rows).
Here's how you can use it:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, next_day, dayofweek, dayofmonth, dayofyear, last_day, year, month, \
    weekofyear, quarter, current_date

spark = SparkSession.builder.getOrCreate()

(
    spark
    .range(0, 1000)
    .alias("id")
    .select(
        (current_date() + col('id').cast("int")).alias("date")
    )
    .select(
        "date",
        next_day("date", 'sunday').alias("next_sunday"),
        dayofweek("date").alias("day_of_week"),
        dayofmonth("date").alias("day_of_month"),
        dayofyear("date").alias("day_of_year"),
        last_day("date").alias("last_day"),
        year("date").alias("year"),
        month("date").alias("month"),
        weekofyear("date").alias("week_of_year"),
        quarter("date").alias("quarter")
    )
).show()
It returns:
+----------+-----------+-----------+------------+-----------+----------+----+-----+------------+-------+
| date|next_sunday|day_of_week|day_of_month|day_of_year| last_day|year|month|week_of_year|quarter|
+----------+-----------+-----------+------------+-----------+----------+----+-----+------------+-------+
|2022-09-22| 2022-09-25| 5| 22| 265|2022-09-30|2022| 9| 38| 3|
|2022-09-23| 2022-09-25| 6| 23| 266|2022-09-30|2022| 9| 38| 3|
|2022-09-24| 2022-09-25| 7| 24| 267|2022-09-30|2022| 9| 38| 3|
|2022-09-25| 2022-10-02| 1| 25| 268|2022-09-30|2022| 9| 38| 3|
|2022-09-26| 2022-10-02| 2| 26| 269|2022-09-30|2022| 9| 39| 3|
|2022-09-27| 2022-10-02| 3| 27| 270|2022-09-30|2022| 9| 39| 3|
|2022-09-28| 2022-10-02| 4| 28| 271|2022-09-30|2022| 9| 39| 3|
|2022-09-29| 2022-10-02| 5| 29| 272|2022-09-30|2022| 9| 39| 3|
|2022-09-30| 2022-10-02| 6| 30| 273|2022-09-30|2022| 9| 39| 3|
|2022-10-01| 2022-10-02| 7| 1| 274|2022-10-31|2022| 10| 39| 4|
|2022-10-02| 2022-10-09| 1| 2| 275|2022-10-31|2022| 10| 39| 4|
|2022-10-03| 2022-10-09| 2| 3| 276|2022-10-31|2022| 10| 40| 4|
|2022-10-04| 2022-10-09| 3| 4| 277|2022-10-31|2022| 10| 40| 4|
|2022-10-05| 2022-10-09| 4| 5| 278|2022-10-31|2022| 10| 40| 4|
|2022-10-06| 2022-10-09| 5| 6| 279|2022-10-31|2022| 10| 40| 4|
|2022-10-07| 2022-10-09| 6| 7| 280|2022-10-31|2022| 10| 40| 4|
|2022-10-08| 2022-10-09| 7| 8| 281|2022-10-31|2022| 10| 40| 4|
|2022-10-09| 2022-10-16| 1| 9| 282|2022-10-31|2022| 10| 40| 4|
|2022-10-10| 2022-10-16| 2| 10| 283|2022-10-31|2022| 10| 41| 4|
|2022-10-11| 2022-10-16| 3| 11| 284|2022-10-31|2022| 10| 41| 4|
+----------+-----------+-----------+------------+-----------+----------+----+-----+------------+-------+
only showing top 20 rows

Related

GroupBy dataframe records and display all columns with PySpark

I have the following dataframe:
dataframe - columnA, columnB, columnC, columnD, columnE
I want to groupBy columnC and then take the max value of columnE:
dataframe.select('*').groupBy('columnC').max('columnE')
Expected output:
dataframe - columnA, columnB, columnC, columnD, columnE
Actual output:
dataframe - columnC, columnE
Why are all the columns in the dataframe not displayed as expected?
For Spark version >= 3.0.0 you can use max_by to select the additional columns.
import random
from pyspark.sql import functions as F

# create some test data
df = spark.createDataFrame(
    [[random.randint(1, 3)] + random.sample(range(0, 30), 4) for _ in range(10)],
    schema=["columnC", "columnB", "columnA", "columnD", "columnE"]) \
    .select("columnA", "columnB", "columnC", "columnD", "columnE")

df.groupBy("columnC") \
    .agg(F.max("columnE"),
         F.expr("max_by(columnA, columnE) as columnA"),
         F.expr("max_by(columnB, columnE) as columnB"),
         F.expr("max_by(columnD, columnE) as columnD")) \
    .show()
For the test data
+-------+-------+-------+-------+-------+
|columnA|columnB|columnC|columnD|columnE|
+-------+-------+-------+-------+-------+
| 25| 20| 2| 0| 2|
| 14| 2| 2| 24| 6|
| 26| 13| 3| 2| 1|
| 5| 24| 3| 19| 17|
| 22| 5| 3| 14| 21|
| 24| 5| 1| 8| 4|
| 7| 22| 3| 16| 20|
| 6| 17| 1| 5| 7|
| 24| 22| 2| 8| 3|
| 4| 14| 1| 16| 11|
+-------+-------+-------+-------+-------+
the result is
+-------+------------+-------+-------+-------+
|columnC|max(columnE)|columnA|columnB|columnD|
+-------+------------+-------+-------+-------+
| 1| 11| 4| 14| 16|
| 3| 21| 22| 5| 14|
| 2| 6| 14| 2| 24|
+-------+------------+-------+-------+-------+
What you want to achieve can be done via a window function, not groupBy:
Partition your data by columnC.
Order the data within each partition by columnE in descending order (rank).
Filter out your desired result.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
from pyspark.sql.functions import col
windowSpec = Window.partitionBy("columnC").orderBy(col("columnE").desc())
expectedDf = df.withColumn("rank", rank().over(windowSpec)) \
    .filter(col("rank") == 1)
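If you only want the original columns afterwards, you can drop the helper column (a small follow-up sketch):
expectedDf = expectedDf.drop("rank")  # remove the helper ranking column
expectedDf.show()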
You might want to restructure your question.

How to create duplicate values of each row and then insert a new dataframe?

How do I duplicate each row of my original dataframe and then add dataframe 2 so that I get my final output? I am writing this in Python with a PySpark dataframe.
What you want is a cross join:
result = df1.crossJoin(df2)
result.show()
#+------+--------+------+-------+------------+-----------------+
#| name| address|salary|bonus %|allowances %|employee category|
#+------+--------+------+-------+------------+-----------------+
#| Tom| Chicago| 75000| 5| 5| onsite|
#| Tom| Chicago| 75000| 10| 10| off shore|
#|Martha|New york| 80000| 5| 5| onsite|
#|Martha|New york| 80000| 10| 10| off shore|
#|Samuel| Phoenix| 90000| 5| 5| onsite|
#|Samuel| Phoenix| 90000| 10| 10| off shore|
#| Rom| Dallas| 65000| 5| 5| onsite|
#| Rom| Dallas| 65000| 10| 10| off shore|
#+------+--------+------+-------+------------+-----------------+
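If you want to reproduce this locally, df1 and df2 might be built as follows (a sketch inferred from the output above; the question's actual dataframes are not shown):
# hypothetical input dataframes, reconstructed from the output columns
df1 = spark.createDataFrame(
    [("Tom", "Chicago", 75000),
     ("Martha", "New york", 80000),
     ("Samuel", "Phoenix", 90000),
     ("Rom", "Dallas", 65000)],
    ["name", "address", "salary"])

df2 = spark.createDataFrame(
    [(5, 5, "onsite"),
     (10, 10, "off shore")],
    ["bonus %", "allowances %", "employee category"])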

Adding multiple columns in pyspark dataframe using a loop

I need to add a number of columns (4000) to the dataframe in PySpark. I am using the withColumn function, but I am getting an assertion error.
df3 = df2.withColumn("['ftr' + str(i) for i in range(0, 4000)]", [expr('ftr[' + str(x) + ']') for x in range(0, 4000)])
Not sure what is wrong.
We can use .select() instead of .withColumn(), with a list as input, to create a result similar to chaining multiple .withColumn() calls. The ["*"] is used to also select every existing column in the dataframe.
import pyspark.sql.functions as F
df2:
+---+
|age|
+---+
| 10|
| 11|
| 13|
+---+
df3 = df2.select(["*"] + [F.lit(f"{x}").alias(f"ftr{x}") for x in range(0,10)])
Results in:
+---+----+----+----+----+----+----+----+----+----+----+
|age|ftr0|ftr1|ftr2|ftr3|ftr4|ftr5|ftr6|ftr7|ftr8|ftr9|
+---+----+----+----+----+----+----+----+----+----+----+
| 10| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|
| 11| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|
| 13| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|
+---+----+----+----+----+----+----+----+----+----+----+
Try to do something like this:
from pyspark.sql.functions import lit

df2 = df3
for i in range(0, 4000):
    df2 = df2.withColumn(f"ftr{i}", lit(f"frt{i}"))

Replace characters in column names in PySpark dataframes

I have a dataframe like below in PySpark:
df = spark.createDataFrame([(2, 'john', 1, 1),
                            (2, 'john', 1, 2),
                            (3, 'pete', 8, 3),
                            (3, 'pete', 8, 4),
                            (5, 'steve', 9, 5)],
                           ['id', '/na/me', 'val/ue', 'rank/'])
df.show()
+---+------+------+-----+
| id|/na/me|val/ue|rank/|
+---+------+------+-----+
| 2| john| 1| 1|
| 2| john| 1| 2|
| 3| pete| 8| 3|
| 3| pete| 8| 4|
| 5| steve| 9| 5|
+---+------+------+-----+
Now in this dataframe I want to replace the / in the column names with an underscore _. But if the / comes at the start or end of a column name, then remove the / instead of replacing it with _.
I have done it like below:
for name in df.schema.names:
    df = df.withColumnRenamed(name, name.replace('/', '_'))
>>> df
DataFrame[id: bigint, _na_me: string, val_ue: bigint, rank_: bigint]
>>> df.show()
+---+------+------+-----+
| id|_na_me|val_ue|rank_|
+---+------+------+-----+
| 2| john| 1| 1|
| 2| john| 1| 2|
| 3| pete| 8| 3|
| 3| pete| 8| 4|
| 5| steve| 9| 5|
+---+------+------+-----+
How can I achieve my desired result, which is below?
+---+------+------+-----+
| id| na_me|val_ue| rank|
+---+------+------+-----+
| 2| john| 1| 1|
| 2| john| 1| 2|
| 3| pete| 8| 3|
| 3| pete| 8| 4|
| 5| steve| 9| 5|
+---+------+------+-----+
Try with a regular expression replace (re.sub) the Python way.
import re

cols = [re.sub(r'(^_|_$)', '', f.replace("/", "_")) for f in df.columns]

df = spark.createDataFrame([(2, 'john', 1, 1),
                            (2, 'john', 1, 2),
                            (3, 'pete', 8, 3),
                            (3, 'pete', 8, 4),
                            (5, 'steve', 9, 5)],
                           ['id', '/na/me', 'val/ue', 'rank/'])

df.toDF(*cols).show()
#+---+-----+------+----+
#| id|na_me|val_ue|rank|
#+---+-----+------+----+
#| 2| john| 1| 1|
#| 2| john| 1| 2|
#| 3| pete| 8| 3|
#| 3| pete| 8| 4|
#| 5|steve| 9| 5|
#+---+-----+------+----+
# or using a for loop on schema.names
for name in df.schema.names:
    df = df.withColumnRenamed(name, re.sub(r'(^_|_$)', '', name.replace('/', '_')))
df.show()
#+---+-----+------+----+
#| id|na_me|val_ue|rank|
#+---+-----+------+----+
#| 2| john| 1| 1|
#| 2| john| 1| 2|
#| 3| pete| 8| 3|
#| 3| pete| 8| 4|
#| 5|steve| 9| 5|
#+---+-----+------+----+

Grouping consecutive rows in a PySpark DataFrame

I have the following example Spark DataFrame:
rdd = sc.parallelize([(1,"19:00:00", "19:30:00", 30), (1,"19:30:00", "19:40:00", 10),(1,"19:40:00", "19:43:00", 3), (2,"20:00:00", "20:10:00", 10), (1,"20:05:00", "20:15:00", 10),(1,"20:15:00", "20:35:00", 20)])
df = spark.createDataFrame(rdd, ["user_id", "start_time", "end_time", "duration"])
df.show()
+-------+----------+--------+--------+
|user_id|start_time|end_time|duration|
+-------+----------+--------+--------+
| 1| 19:00:00|19:30:00| 30|
| 1| 19:30:00|19:40:00| 10|
| 1| 19:40:00|19:43:00| 3|
| 2| 20:00:00|20:10:00| 10|
| 1| 20:05:00|20:15:00| 10|
| 1| 20:15:00|20:35:00| 20|
+-------+----------+--------+--------+
I want to group consecutive rows based on the start and end times. For instance, for the same user_id, if a row's start time is the same as the previous row's end time, I want to group them together and sum the duration.
The desired result is:
+-------+----------+--------+--------+
|user_id|start_time|end_time|duration|
+-------+----------+--------+--------+
| 1| 19:00:00|19:43:00| 43|
| 2| 20:00:00|20:10:00| 10|
| 1| 20:05:00|20:35:00| 30|
+-------+----------+--------+--------+
The first three rows of the dataframe were grouped together because they all correspond to user_id 1 and the start times and end times form a continuous timeline.
This was my initial approach:
Use the lag function to get the next start time:
from pyspark.sql.functions import *
from pyspark.sql import Window
import sys
# compute next start time
window = Window.partitionBy('user_id').orderBy('start_time')
df = df.withColumn("next_start_time", lag(df.start_time, -1).over(window))
df.show()
+-------+----------+--------+--------+---------------+
|user_id|start_time|end_time|duration|next_start_time|
+-------+----------+--------+--------+---------------+
| 1| 19:00:00|19:30:00| 30| 19:30:00|
| 1| 19:30:00|19:40:00| 10| 19:40:00|
| 1| 19:40:00|19:43:00| 3| 20:05:00|
| 1| 20:05:00|20:15:00| 10| 20:15:00|
| 1| 20:15:00|20:35:00| 20| null|
| 2| 20:00:00|20:10:00| 10| null|
+-------+----------+--------+--------+---------------+
Get the difference between the current row's end time and the next row's start time:
time_fmt = "HH:mm:ss"
timeDiff = unix_timestamp('next_start_time', format=time_fmt) - unix_timestamp('end_time', format=time_fmt)
df = df.withColumn("difference", timeDiff)
df.show()
+-------+----------+--------+--------+---------------+----------+
|user_id|start_time|end_time|duration|next_start_time|difference|
+-------+----------+--------+--------+---------------+----------+
| 1| 19:00:00|19:30:00| 30| 19:30:00| 0|
| 1| 19:30:00|19:40:00| 10| 19:40:00| 0|
| 1| 19:40:00|19:43:00| 3| 20:05:00| 1320|
| 1| 20:05:00|20:15:00| 10| 20:15:00| 0|
| 1| 20:15:00|20:35:00| 20| null| null|
| 2| 20:00:00|20:10:00| 10| null| null|
+-------+----------+--------+--------+---------------+----------+
Now my idea was to use the sum function with a window to get the cumulative sum of duration and then do a groupBy. But my approach was flawed for many reasons.
Here's one approach:
Gather together rows into groups where a group is a set of rows with the same user_id that are consecutive (start_time matches previous end_time). Then you can use this group to do your aggregation.
A way to get there is to create intermediate indicator columns that tell you whether the user has changed or the time is not consecutive. Then perform a cumulative sum over the indicator column to create the group.
For example:
import pyspark.sql.functions as f
from pyspark.sql import Window

w1 = Window.orderBy("start_time")

df = df.withColumn(
        "userChange",
        (f.col("user_id") != f.lag("user_id").over(w1)).cast("int")
    )\
    .withColumn(
        "timeChange",
        (f.col("start_time") != f.lag("end_time").over(w1)).cast("int")
    )\
    .fillna(
        0,
        subset=["userChange", "timeChange"]
    )\
    .withColumn(
        "indicator",
        (~((f.col("userChange") == 0) & (f.col("timeChange") == 0))).cast("int")
    )\
    .withColumn(
        "group",
        f.sum(f.col("indicator")).over(w1.rangeBetween(Window.unboundedPreceding, 0))
    )
df.show()
#+-------+----------+--------+--------+----------+----------+---------+-----+
#|user_id|start_time|end_time|duration|userChange|timeChange|indicator|group|
#+-------+----------+--------+--------+----------+----------+---------+-----+
#| 1| 19:00:00|19:30:00| 30| 0| 0| 0| 0|
#| 1| 19:30:00|19:40:00| 10| 0| 0| 0| 0|
#| 1| 19:40:00|19:43:00| 3| 0| 0| 0| 0|
#| 2| 20:00:00|20:10:00| 10| 1| 1| 1| 1|
#| 1| 20:05:00|20:15:00| 10| 1| 1| 1| 2|
#| 1| 20:15:00|20:35:00| 20| 0| 0| 0| 2|
#+-------+----------+--------+--------+----------+----------+---------+-----+
Now that we have the group column, we can aggregate as follows to get the desired result:
df.groupBy("user_id", "group")\
    .agg(
        f.min("start_time").alias("start_time"),
        f.max("end_time").alias("end_time"),
        f.sum("duration").alias("duration")
    )\
    .drop("group")\
    .show()
#+-------+----------+--------+--------+
#|user_id|start_time|end_time|duration|
#+-------+----------+--------+--------+
#| 1| 19:00:00|19:43:00| 43|
#| 1| 20:05:00|20:35:00| 30|
#| 2| 20:00:00|20:10:00| 10|
#+-------+----------+--------+--------+
Here is a working solution derived from Pault's answer:
Create the Dataframe:
rdd = sc.parallelize([(1,"19:00:00", "19:30:00", 30), (1,"19:30:00", "19:40:00", 10),(1,"19:40:00", "19:43:00", 3), (2,"20:00:00", "20:10:00", 10), (1,"20:05:00", "20:15:00", 10),(1,"20:15:00", "20:35:00", 20)])
df = spark.createDataFrame(rdd, ["user_id", "start_time", "end_time", "duration"])
df.show()
+-------+----------+--------+--------+
|user_id|start_time|end_time|duration|
+-------+----------+--------+--------+
| 1| 19:00:00|19:30:00| 30|
| 1| 19:30:00|19:40:00| 10|
| 1| 19:40:00|19:43:00| 3|
| 1| 20:05:00|20:15:00| 10|
| 1| 20:15:00|20:35:00| 20|
+-------+----------+--------+--------+
Create an indicator column that indicates whenever the time has changed, and use cumulative sum to give each group a unique id:
import pyspark.sql.functions as f
from pyspark.sql import Window

w1 = Window.partitionBy('user_id').orderBy('start_time')

df = df.withColumn(
        "indicator",
        (f.col("start_time") != f.lag("end_time").over(w1)).cast("int")
    )\
    .fillna(
        0,
        subset=["indicator"]
    )\
    .withColumn(
        "group",
        f.sum(f.col("indicator")).over(w1.rangeBetween(Window.unboundedPreceding, 0))
    )
df.show()
+-------+----------+--------+--------+---------+-----+
|user_id|start_time|end_time|duration|indicator|group|
+-------+----------+--------+--------+---------+-----+
| 1| 19:00:00|19:30:00| 30| 0| 0|
| 1| 19:30:00|19:40:00| 10| 0| 0|
| 1| 19:40:00|19:43:00| 3| 0| 0|
| 1| 20:05:00|20:15:00| 10| 1| 1|
| 1| 20:15:00|20:35:00| 20| 0| 1|
+-------+----------+--------+--------+---------+-----+
Now groupBy on user_id and the group column to get the final result.
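A sketch of that aggregation, mirroring the one in the answer above:
df.groupBy("user_id", "group")\
    .agg(
        f.min("start_time").alias("start_time"),
        f.max("end_time").alias("end_time"),
        f.sum("duration").alias("duration")
    )\
    .drop("group")\
    .show()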
+-------+----------+--------+--------+
|user_id|start_time|end_time|duration|
+-------+----------+--------+--------+
| 1| 19:00:00|19:43:00| 43|
| 1| 20:05:00|20:35:00| 30|
+-------+----------+--------+--------+
