Getting a value from DataFrame based on other column value (PySpark) - python

I have a Spark dataframe which I want to get the statistics
stats_df = df.describe(['mycol'])
stats_df.show()
+-------+------------------+
|summary| mycol|
+-------+------------------+
| count| 300|
| mean| 2243|
| stddev| 319.419860456123|
| min| 1400|
| max| 3100|
+-------+------------------+
How do I extract the values of min and max in mycol using the summary min max column values? How do I do it by number index?

You could easily assign a variable from a select on that dataframe.
x = stats_df.select('mycol').where('summary' == 'min')

Ok let's consider the following example :
from pyspark.sql.functions import rand, randn
df = sqlContext.range(1, 1000).toDF('mycol')
df.describe().show()
# +-------+-----------------+
# |summary| mycol|
# +-------+-----------------+
# | count| 999|
# | mean| 500.0|
# | stddev|288.5307609250702|
# | min| 1|
# | max| 999|
# +-------+-----------------+
If you want to access the row concerning stddev, per example, you'll just need to convert it into an RDD, collect it and convert it into a dictionary as following :
stats = dict(df.describe().map(lambda r : (r.summary,r.mycol)).collect())
print(stats['stddev'])
# 288.5307609250702

Related

Pyspark use partition or groupby with agg and datediff

I'm new to Pyspark.
I would like to find the products not seen after 10 days from the first day they entered the store. And create a column in dataframe and set it to 1 for these products and 0 for the rest.
First I need to group the data based on product_id, then find the maximum of the seen_date. And finally calculate the difference between import_date and max(seen_date) in the groups. And finally create a new column based on the value of date_diff in each group.
Following is the code I used to first get the difference between the import_date and seen_date, but it gives error:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
w = (Window()
.partitionBy(df.product_id)
.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
df.withColumn("date_diff", F.datediff(F.max(F.from_unixtime(F.col("import_date")).over(w)), F.from_unixtime(F.col("seen_date"))))
Error:
AnalysisException: It is not allowed to use a window function inside an aggregate function. Please use the inner window function in a sub-query.
This is the rest of my code to define a new column based on the date_diff:
not_seen = udf(lambda x: 0 if x >10 else 1, IntegerType())
df = df.withColumn('not_seen', not_seen("date_diff"))
Q: Can someone provide a fix for this code or a better approach to solve this problem?
sample data generation:
columns = ["product_id","import_date", "seen_date"]
data = [("123", "2014-05-06", "2014-05-07"),
("123", "2014-05-06", "2014-06-11"),
("125", "2015-01-02", "2015-01-03"),
("125", "2015-01-02", "2015-01-04"),
("128", "2015-08-06", "2015-08-25")]
dfFromData2 = spark.createDataFrame(data).toDF(*columns)
dfFromData2 = dfFromData2.withColumn("import_date",F.unix_timestamp(F.col("import_date"),'yyyy-MM-dd'))
dfFromData2 = dfFromData2.withColumn("seen_date",F.unix_timestamp(F.col("seen_date"),'yyyy-MM-dd'))
+----------+-----------+----------+
|product_id|import_date| seen_date|
+----------+-----------+----------+
| 123| 1399334400|1399420800|
| 123| 1399334400|1402444800|
| 125| 1420156800|1420243200|
| 125| 1420156800|1420329600|
| 128| 1438819200|1440460800|
+----------+-----------+----------+
columns = ["product_id","import_date", "seen_date"]
data = [("123", "2014-05-06", "2014-05-07"),
("123", "2014-05-06", "2014-06-11"),
("125", "2015-01-02", "2015-01-03"),
("125", "2015-01-02", "2015-01-04"),
("128", "2015-08-06", "2015-08-25")]
df = spark.createDataFrame(data).toDF(*columns)
df = df.withColumn("import_date",F.to_date(F.col("import_date"),'yyyy-MM-dd'))
df = df.withColumn("seen_date",F.to_date(F.col("seen_date"),'yyyy-MM-dd'))
from pyspark.sql.window import Window
from pyspark.sql import functions as F
w = (Window()
.partitionBy(df.product_id)
.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
df\
.withColumn('max_import_date', F.max(F.col("import_date")).over(w))\
.withColumn("date_diff", F.datediff(F.col('seen_date'), F.col('max_import_date')))\
.withColumn('not_seen', F.when(F.col('date_diff') > 10, 0).otherwise(1))\
.show()
+----------+-----------+----------+---------------+---------+--------+
|product_id|import_date| seen_date|max_import_date|date_diff|not_seen|
+----------+-----------+----------+---------------+---------+--------+
| 123| 2014-05-06|2014-05-07| 2014-05-06| 1| 1|
| 123| 2014-05-06|2014-06-11| 2014-05-06| 36| 0|
| 125| 2015-01-02|2015-01-03| 2015-01-02| 1| 1|
| 125| 2015-01-02|2015-01-04| 2015-01-02| 2| 1|
| 128| 2015-08-06|2015-08-25| 2015-08-06| 19| 0|
+----------+-----------+----------+---------------+---------+--------+
You can use the max windowing function to extract the max date.
dfFromData2 = dfFromData2.withColumn(
'not_seen',
F.expr('if(datediff(max(from_unixtime(seen_date)) over (partition by product_id), from_unixtime(import_date)) > 10, 1, 0)')
)
dfFromData2.show(truncate=False)
# +----------+-----------+----------+--------+
# |product_id|import_date|seen_date |not_seen|
# +----------+-----------+----------+--------+
# |125 |1420128000 |1420214400|0 |
# |125 |1420128000 |1420300800|0 |
# |123 |1399305600 |1399392000|1 |
# |123 |1399305600 |1402416000|1 |
# |128 |1438790400 |1440432000|1 |
# +----------+-----------+----------+--------+

Pyspark: sum over a window based on a condition

Consider the simple DataFrame:
from pyspark import SparkContext
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf, PandasUDFType
spark = SparkSession.builder.appName('Trial').getOrCreate()
simpleData = (("2000-04-17", "144", 1), \
("2000-07-06", "015", 1), \
("2001-01-23", "015", -1), \
("2001-01-18", "144", -1), \
("2001-04-17", "198", 1), \
("2001-04-18", "036", -1), \
("2001-04-19", "012", -1), \
("2001-04-19", "188", 1), \
("2001-04-25", "188", 1),\
("2001-04-27", "015", 1) \
)
columns= ["dates", "id", "eps"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)
Out:
root
|-- dates: string (nullable = true)
|-- id: string (nullable = true)
|-- eps: long (nullable = true)
+----------+---+---+
|dates |id |eps|
+----------+---+---+
|2000-04-17|144|1 |
|2000-07-06|015|1 |
|2001-01-23|015|-1 |
|2001-01-18|144|-1 |
|2001-04-17|198|1 |
|2001-04-18|036|-1 |
|2001-04-19|012|-1 |
|2001-04-19|188|1 |
|2001-04-25|188|1 |
|2001-04-27|015|1 |
+----------+---+---+
I would like to sum the values in the eps column over a rolling window keeping only the last value for any given ID in the id column. For example, defining a window of 5 rows and assuming we are on 2001-04-17, I want to sum only the last eps value for each given unique ID. In the 5 rows we have only 3 different ID, so the sum must be of 3 elements: -1 for the ID 144 (forth row), -1 for the ID 015 (third row) and 1 for the ID 198 (fifth row) for a total of -1.
In my mind, within the rolling window I should do something like F.sum(groupBy('id').agg(F.last('eps'))) that of course is not possible to achieve in a rolling window.
I obtained the desired result using a UDF.
#pandas_udf(IntegerType(), PandasUDFType.GROUPEDAGG)
def fun_sum(id, eps):
df = pd.DataFrame()
df['id'] = id
df['eps'] = eps
value = df.groupby('id').last().sum()
return value
And then:
w = Window.orderBy('dates').rowsBetween(-5,0)
df = df.withColumn('sum', fun_sum(F.col('id'), F.col('eps')).over(w))
The problem is that my dataset contains more than 8 milion rows and performing this task with this UDF takes about 2 hours.
I was wandering whether there is a way to achieve the same result with built-in PySpark functions avoiding using a UDF or at least whether there is a way to improve the performance of my UDF.
For completeness, the desired output should be:
+----------+---+---+----+
|dates |id |eps|sum |
+----------+---+---+----+
|2000-04-17|144|1 |1 |
|2000-07-06|015|1 |2 |
|2001-01-23|015|-1 |0 |
|2001-01-18|144|-1 |-2 |
|2001-04-17|198|1 |-1 |
|2001-04-18|036|-1 |-2 |
|2001-04-19|012|-1 |-3 |
|2001-04-19|188|1 |-1 |
|2001-04-25|188|1 |0 |
|2001-04-27|015|1 |0 |
+----------+---+---+----+
EDIT: the rseult must also be achievable using a .rangeBetween() window.
In case you haven't figured it out yet, here's one way of achieving it.
Assuming that df is defined and initialised the way you defined and initialised it in your question.
Import the required functions and classes:
from pyspark.sql.functions import row_number, col
from pyspark.sql.window import Window
Create the necessary WindowSpec:
window_spec = (
Window
# Partition by 'id'.
.partitionBy(df.id)
# Order by 'dates', latest dates first.
.orderBy(df.dates.desc())
)
Create a DataFrame with partitioned data:
partitioned_df = (
df
# Use the window function 'row_number()' to populate a new column
# containing a sequential number starting at 1 within a window partition.
.withColumn('row', row_number().over(window_spec))
# Only select the first entry in each partition (i.e. the latest date).
.where(col('row') == 1)
)
Just in case you want to double-check the data:
partitioned_df.show()
# +----------+---+---+---+
# | dates| id|eps|row|
# +----------+---+---+---+
# |2001-04-19|012| -1| 1|
# |2001-04-25|188| 1| 1|
# |2001-04-27|015| 1| 1|
# |2001-04-17|198| 1| 1|
# |2001-01-18|144| -1| 1|
# |2001-04-18|036| -1| 1|
# +----------+---+---+---+
Group and aggregate the data:
sum_rows = (
partitioned_df
# Aggragate data.
.groupBy()
# Sum all rows in 'eps' column.
.sum('eps')
# Get all records as a list of Rows.
.collect()
)
Get the result:
print(f"sum eps: {sum_rows[0][0]})
# sum eps: 0

Pyspark mapping regex

I have a pyspark dataframe, with text column.
I wanted to map the values which with a regex expression.
df = df.withColumn('mapped_col', regexp_replace('mapped_col', '.*-RH', 'RH'))
df = df.withColumn('mapped_col', regexp_replace('mapped_col', '.*-FI, 'FI'))
Plus I wanted to map specifics values according to a dictionnary, I did the following (mapper is from create_map()):
df = df.withColumn("mapped_col",mapper.getItem(F.col("action")))
Finaly the values which has not been mapped by the dictionnary or the regex expression, will be set null. I do not know how to do this part in accordance to the two others.
Is it possible to have like a dictionnary of regex expression so I can regroup the two 'functions'?
{".*-RH": "RH", ".*FI" : "FI"}
Original Output Example
+-----------------------------+
|message |
+-----------------------------+
|GDF2009 |
|GDF2014 |
|ADS-set |
|ADS-set |
|XSQXQXQSDZADAA5454546a45a4-FI|
|dadaccpjpifjpsjfefspolamml-FI|
|dqdazdaapijiejoajojp565656-RH|
|kijipiadoa
+-----------------------------+
Expected Output Example
+-----------------------------+-----------------------------+
|message |status|
+-----------------------------+-----------------------------+
|GDF2009 | GDF
|GDF2014 | GDF
|ADS/set | ADS
|ADS-set | ADS
|XSQXQXQSDZADAA5454546a45a4-FI| FI
|dadaccpjpifjpsjfefspolamml-FI| FI
|dqdazdaapijiejoajojp565656-RH| RH
|kijipiadoa | null or ??
So first 4th line are mapped with a dict, and the other are mapped using regex. Unmapped are null or ??
Thank you,
You can achieve it using contains function:
from pyspark.sql.types import StringType
df = spark.createDataFrame(
["GDF2009", "GDF2014", "ADS-set", "ADS-set", "XSQXQXQSDZADAA5454546a45a4-FI", "dadaccpjpifjpsjfefspolamml-FI",
"dqdazdaapijiejoajojp565656-RH", "kijipiadoa"], StringType()).toDF("message")
df.show()
names = ("GDF", "ADS", "FI", "RH")
def c(col, names):
return [f.when(f.col(col).contains(i), i).otherwise("") for i in names]
df.select("message", f.concat_ws("", f.array_remove(f.array(*c("message", names)), "")).alias("status")).show()
output:
+--------------------+
| message|
+--------------------+
| GDF2009|
| GDF2014|
| ADS-set|
| ADS-set|
|XSQXQXQSDZADAA545...|
|dadaccpjpifjpsjfe...|
|dqdazdaapijiejoaj...|
| kijipiadoa|
+--------------------+
+--------------------+------+
| message|status|
+--------------------+------+
| GDF2009| GDF|
| GDF2014| GDF|
| ADS-set| ADS|
| ADS-set| ADS|
|XSQXQXQSDZADAA545...| FI|
|dadaccpjpifjpsjfe...| FI|
|dqdazdaapijiejoaj...| RH|
| kijipiadoa| |
+--------------------+------+

Pyspark cumulative product using numpy

I want to perform a cumulative product, previous successful answers use logarithmic sums to the the deed. However, is there a way to use Numpy cumsum. I have tried with no clear result, here is my code:
import numpy as np
def cumulative_product (x):
"""Calculation of cumulative product using numpy function cumprod.
"""
return np.cumprod(float(x)).tolist()
spark_cumulative_product = udf(cumulative_product, ArrayType(DoubleType()))
# the dataset in question:
param.show()
Which gives me for example:
+--------------+-----+
|financial_year| wpi|
+--------------+-----+
| 2014|1.026|
| 2015|1.024|
| 2016|1.021|
| 2017|1.019|
| 2018|1.021|
+--------------+-----+
When applying
param = param.withColumn('cum_wpi', spark_cumulative_product(param_treasury['wpi']))
param.show()
I have that there are no changes i.e.
+--------------+-----+-------+
|financial_year| wpi|cum_wpi|
+--------------+-----+-------+
| 2014|1.026|[1.026]|
| 2015|1.024|[1.024]|
| 2016|1.021|[1.021]|
| 2017|1.019|[1.019]|
| 2018|1.021|[1.021]|
+--------------+-----+-------+
Can anyone help on what is going wrong or if there is a better way to do cumprod without using exp-sum-log
-Update:
The desired output is:
+--------------+-----+-------+
|financial_year| wpi|cum_wpi|
+--------------+-----+-------+
| 2014|1.026| 1.026 |
| 2015|1.024| 1.051 |
| 2016|1.021| 1.073 |
| 2017|1.019| 1.093 |
| 2018|1.021| 1.116 |
+--------------+-----+-------+
One way you can achieve this using cum_prod() pandas series function, using a pandas grouped map UDF.
Sample DataFrame:
#+--------------+-----+
#|financial_year| wpi|
#+--------------+-----+
#| 2014|1.026|
#| 2015|1.024|
#| 2016|1.021|
#| 2017|1.019|
#| 2018|1.021|
#+--------------+-----+
I will first create a dummy column, which will be similar to our cum_wpi. I will overwrite this dummy column in the pandas udf. The use of orderBy right before the groupby and apply is there to ensure that the dataframe is sorted on financial_year.
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
df1=df.withColumn("cum_wpi", F.lit(1.2456))
#pandas_udf(df1.schema, PandasUDFType.GROUPED_MAP)
def grouped_map(df1):
df1['cum_wpi']=df1['wpi'].cumprod().round(decimals=3)
return df1
df.orderBy(F.col("financial_year").asc())\
.groupby().apply(grouped_map).show()
#+--------------+-----+-------+
#|financial_year| wpi|cum_wpi|
#+--------------+-----+-------+
#| 2014|1.026| 1.026|
#| 2015|1.024| 1.051|
#| 2016|1.021| 1.073|
#| 2017|1.019| 1.093|
#| 2018|1.021| 1.116|
#+--------------+-----+-------+
UPDATE:
You can use aggregate as mentioned earlier by #pault, as long as we cast acc(accumulator) to double we can handle your values.
df.withColumn("cum_wpi", F.expr("""format_number(aggregate(collect_list(wpi)\
over (order by financial_year)\
,cast(1 as double),(acc,x)-> acc*x),3)"""))\
.show(truncate=False)
#+--------------+-----+-------+
#|financial_year|wpi |cum_wpi|
#+--------------+-----+-------+
#|2014 |1.026|1.026 |
#|2015 |1.024|1.051 |
#|2016 |1.021|1.073 |
#|2017 |1.019|1.093 |
#|2018 |1.021|1.116 |
#+--------------+-----+-------+

Use spark function result as input of another function

In my Spark application I have a dataframe with informations like
+------------------+---------------+
| labels | labels_values |
+------------------+---------------+
| ['l1','l2','l3'] | 000 |
| ['l3','l4','l5'] | 100 |
+------------------+---------------+
What I am trying to achieve is to create, given a label name as input a single_label_value column that takes the value for that label from the labels_values column.
For example, for label='l3' I would like to retrieve this output:
+------------------+---------------+--------------------+
| labels | labels_values | single_label_value |
+------------------+---------------+--------------------+
| ['l1','l2','l3'] | 000 | 0 |
| ['l3','l4','l5'] | 100 | 1 |
+------------------+---------------+--------------------+
Here's what I am attempting to use:
selected_label='l3'
label_position = F.array_position(my_df.labels, selected_label)
my_df= my_df.withColumn(
"single_label_value",
F.substring(my_df.labels_values, label_position, 1)
)
But I am getting an error because the substring function does not like the label_position argument.
Is there any way to combine these function outputs without writing an udf?
Hope, this will work for you.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark=SparkSession.builder.getOrCreate()
mydata=[[['l1','l2','l3'],'000'], [['l3','l4','l5'],'100']]
df = spark.createDataFrame(mydata,schema=["lebels","lebel_values"])
selected_label='l3'
df2=df.select(
"*",
(array_position(df.lebels,selected_label)-1).alias("pos_val"))
df2.createOrReplaceTempView("temp_table")
df3=spark.sql("select *,substring(lebel_values,pos_val,1) as val_pos from temp_table")
df3.show()
+------------+------------+-------+-------+
| lebels|lebel_values|pos_val|val_pos|
+------------+------------+-------+-------+
|[l1, l2, l3]| 000| 2| 0|
|[l3, l4, l5]| 100| 0| 1|
+------------+------------+-------+-------+
This is giving location of the value. If you want exact index then you can use -1 from this value.
--Edited anser -> Worked with temp view. Still looking for solution using withColumn option. I hope, it will help you for now.
Edit2 -> Answer using dataframe.
df2=df.select(
"*",
(array_position(df.lebels,selected_label)-1).astype("int").alias("pos_val")
)
df3=df2.withColumn("asked_col",expr("substring(lebel_values,pos_val,1)"))
df3.show()
Try maybe:
import pyspark.sql.functions as f
from pyspark.sql.functions import *
selected_label='l3'
df=df.withColumn('single_label_value', f.substring(f.col('labels_values'), array_position(f.col('labels'), lit(selected_label))-1, 1))
df.show()
(for spark version >=2.4)
I think lit() was the function you were missing - you can use it to pass constant values to spark dataframes.

Categories

Resources