PySpark row-wise function composition - python

As a simplified example, I have a dataframe "df" with columns "col1" and "col2", and I want to compute a row-wise maximum after applying a function to each column:
def f(x):
    return x + 1

max_udf = udf(lambda x, y: max(x, y), IntegerType())
f_udf = udf(f, IntegerType())

df2 = df.withColumn("result", max_udf(f_udf(df.col1), f_udf(df.col2)))
So if df is:

col1  col2
1     2
3     0

then df2 should be:

col1  col2  result
1     2     3
3     0     4
The above doesn't seem to work and produces "Cannot evaluate expression: PythonUDF#f..."
I'm absolutely positive "f_udf" works just fine on my table, and the main issue is with the max_udf.
Without creating extra columns or using basic map/reduce, is there a way to do the above entirely using dataframes and udfs? How should I modify "max_udf"?
I've also tried:
max_udf=udf(max, IntegerType())
which produces the same error.
I've also confirmed that the following works:
df2 = (df.withColumn("temp1", f_udf(df.col1))
         .withColumn("temp2", f_udf(df.col2)))
df2 = df2.withColumn("result", max_udf(df2.temp1, df2.temp2))
Why is it that I can't do these in one go?
I would like to see an answer that generalizes to any function "f_udf" and "max_udf."

I had a similar problem and found the solution in the answer to this stackoverflow question.
To pass multiple columns or a whole row to a UDF, use a struct:
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import IntegerType
df = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
count_empty_columns = udf(lambda row: len([x for x in row if x is None]), IntegerType())
new_df = df.withColumn("null_count", count_empty_columns(struct([df[x] for x in df.columns])))
new_df.show()
returns:
+----+----+----------+
| a| b|null_count|
+----+----+----------+
|null|null| 2|
| 1|null| 1|
|null| 2| 1|
+----+----+----------+
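The same struct trick also covers the original row-wise maximum; a minimal sketch, assuming the asker's df with integer columns col1 and col2:

row_max_udf = udf(lambda row: max(v + 1 for v in row), IntegerType())
df2 = df.withColumn("result", row_max_udf(struct(df.col1, df.col2)))

Iterating over the struct Row yields the column values, so applying f and taking the max happen in a single UDF call per row.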

UserDefinedFunction throws an error when it receives another UDF as its argument.
You can modify max_udf as below to make it work.
df = sc.parallelize([(1, 2), (3, 0)]).toDF(["col1", "col2"])
max_udf = udf(lambda x, y: max(x + 1, y + 1), IntegerType())
df2 = df.withColumn("result", max_udf(df.col1, df.col2))
Or
def f_udf(x):
    return x + 1

max_udf = udf(lambda x, y: max(x, y), IntegerType())
## f_udf=udf(f, IntegerType())
df2 = df.withColumn("result", max_udf(f_udf(df.col1), f_udf(df.col2)))
Note:
The second approach is valid if and only if internal functions (here f_udf) generate valid SQL expressions.
It works here because f_udf(df.col1) and f_udf(df.col2) are evaluated as Column<b'(col1 + 1)'> and Column<b'(col2 + 1)'> respectively, before being passed to max_udf. It wouldn't work with an arbitrary function.
It wouldn't work, for example, if we tried something like this:
from math import exp
df.withColumn("result", max_udf(exp(df.col1), exp(df.col2)))
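If you actually wanted exponentials here, a hedged fix is to use the Column-based exp from pyspark.sql.functions, which returns a SQL expression rather than a Python float, together with a UDF declared with a floating-point return type:

from pyspark.sql.functions import exp as spark_exp
from pyspark.sql.types import DoubleType

max_double_udf = udf(lambda x, y: max(x, y), DoubleType())
df.withColumn("result", max_double_udf(spark_exp(df.col1), spark_exp(df.col2)))

spark_exp returns a Column expression, so it composes with the UDF the same way f_udf did above.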

The best way to handle this is to step outside the pyspark.sql.DataFrame representation and use RDDs, via pyspark.sql.Row.asDict() and pyspark.RDD.map() (https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.map.html#pyspark.RDD.map).
import typing
# Save yourself some pain and always import these things: functions as F and types as T
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Row, SparkSession, SQLContext
spark = (
    SparkSession.builder.appName("Stack Overflow Example")
    .getOrCreate()
)
sc = spark.sparkContext
# sqlContext is needed sometimes to create DataFrames from RDDs
sqlContext = SQLContext(sc)
df = sc.parallelize([Row(**{"a": "hello", "b": 1, "c": 2}), Row(**{"a": "goodbye", "b": 2, "c": 1})]).toDF(["a", "b", "c"])
def to_string(record: dict) -> Row:
    """Create a readable string representation of the record"""
    record["readable"] = f'Word: {record["a"]} A: {record["b"]} B: {record["c"]}'
    return Row(**record)
# Apply the function with a map after converting the Row to a dict
readable_rdd = df.rdd.map(lambda x: x.asDict()).map(to_string)
# Test the function without running the entire DataFrame through it
print(readable_rdd.first())
# This results in: Row(a='hello', b=1, c=2, readable='Word: hello A: 1 B: 2')
# Sometimes you can use `toDF()` to get a dataframe
readable_df = readable_rdd.toDF()
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
# Sometimes you have to use createDataFrame with a specified schema
schema = T.StructType(
    [
        T.StructField("a", T.StringType(), True),
        T.StructField("b", T.IntegerType(), True),
        T.StructField("c", T.IntegerType(), True),
        T.StructField("readable", T.StringType(), True),
    ]
)
# This is more reliable, you should use it in production!
readable_df = sqlContext.createDataFrame(readable_rdd, schema)
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
Sometimes RDD.map() functions can't use certain Python libraries because the mapper functions get serialized, so you need to partition the data into enough partitions to occupy all the cores of the cluster and then use pyspark.RDD.mapPartitions() to process an entire partition (just an Iterable of dicts) at a time. This enables you to instantiate an expensive object once - like a spaCy Language model - and apply it to one record at a time without recreating it.
def to_string_partition(partition: typing.Iterable[dict]) -> typing.Iterable[Row]:
    """Add a readable string form to an entire partition"""
    # Instantiate expensive objects here, once per partition
    for record in partition:
        # Apply those objects' methods to each record here
        record["readable"] = f'Word: {record["a"]} A: {record["b"]} B: {record["c"]}'
        yield Row(**record)
readable_rdd = df.rdd.map(lambda x: x.asDict()).mapPartitions(to_string_partition)
print(readable_rdd.first())
# Row(a='hello', b=1, c=2, readable='Word: hello A: 1 B: 2')
# mapPartitions are more likely to require a specified schema
schema = T.StructType(
    [
        T.StructField("a", T.StringType(), True),
        T.StructField("b", T.IntegerType(), True),
        T.StructField("c", T.IntegerType(), True),
        T.StructField("readable", T.StringType(), True),
    ]
)
# This is more reliable, you should use it in production!
readable_df = sqlContext.createDataFrame(readable_rdd, schema)
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
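For illustration, here is a hedged sketch of the expensive-object pattern described above, using a hypothetical load_model() as a stand-in for something like a spaCy model (not a real API):

def tag_partition(partition: typing.Iterable[dict]) -> typing.Iterable[Row]:
    model = load_model()  # hypothetical loader; instantiated once per partition, not once per record
    for record in partition:
        record["tags"] = model(record["a"])  # hypothetical call applied record by record
        yield Row(**record)

tagged_rdd = df.rdd.map(lambda x: x.asDict()).mapPartitions(tag_partition)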
The DataFrame APIs are good because they make SQL-like operations fast, but sometimes you need the power of direct Python without limitations, and it will greatly benefit your analytics practice to learn to employ RDDs. For example, you can group records and then evaluate the entire group in RAM, as long as it fits, which you can arrange by altering the partition key and limiting workers / increasing their RAM.
import numpy as np

def median_b(x):
    """Process a group and determine the median value"""
    key = x[0]
    values = x[1]
    # Get the median value of "b" across the group
    m = np.median([record["b"] for record in values])
    # Return a Row of the median for each group
    return Row(**{"a": key, "median_b": m})

median_b_rdd = df.rdd.map(lambda x: x.asDict()).groupBy(lambda x: x["a"]).map(median_b)
median_b_rdd.first()
# Row(a='hello', median_b=1.0)
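If you need the grouped result back as a DataFrame, the same toDF() / createDataFrame() conversions shown above apply; a small follow-up sketch:

median_b_df = median_b_rdd.toDF()
median_b_df.show()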

Below is a utility function made to create any new column by simply calling a top-level business rule, completely isolated from the technical and heavyweight Spark internals (no need to spend money on, or feel dependent on, Databricks libraries anymore).
My advice: in your organization, try to do things simply and cleanly, for the benefit of top-level data users:
def createColumnFromRule(df, columnName, ruleClass, ruleName, inputColumns=None, inputValues=None, columnType=None):
    from pyspark.sql import functions as F
    from pyspark.sql import types as T

    def _getSparkClassType(shortType):
        defaultSparkClassType = "StringType"
        typesMapping = {
            "bigint": "LongType",
            "binary": "BinaryType",
            "boolean": "BooleanType",
            "byte": "ByteType",
            "date": "DateType",
            "decimal": "DecimalType",
            "double": "DoubleType",
            "float": "FloatType",
            "int": "IntegerType",
            "integer": "IntegerType",
            "long": "LongType",
            "numeric": "NumericType",
            "string": defaultSparkClassType,
            "timestamp": "TimestampType"
        }
        try:
            sparkClassType = typesMapping[shortType]
        except KeyError:
            sparkClassType = defaultSparkClassType
        return sparkClassType

    if columnType is not None:
        sparkClassType = _getSparkClassType(columnType)
    else:
        sparkClassType = "StringType"
    # Build the UDF from the rule method and the resolved Spark type
    aUdf = eval("F.udf(ruleClass." + ruleName + ", T." + sparkClassType + "())")
    columns = None
    values = None
    if inputColumns is not None:
        columns = F.struct([df[column] for column in inputColumns])
    if inputValues is not None:
        values = F.struct([F.lit(value) for value in inputValues])
    # Call the rule
    if inputColumns is not None and inputValues is not None:
        df = df.withColumn(columnName, aUdf(columns, values))
    elif inputColumns is not None:
        df = df.withColumn(columnName, aUdf(columns, F.lit(None)))
    elif inputValues is not None:
        df = df.withColumn(columnName, aUdf(F.lit(None), values))
    # Create a null column otherwise
    else:
        if columnType is not None:
            df = df.withColumn(columnName, F.lit(None).cast(columnType))
        else:
            df = df.withColumn(columnName, F.lit(None))
    # Return the resulting dataframe
    return df
Usage example:
# Define your business rule (you can get columns and values)
class CustomerRisk:
    def churnRisk(self, columns=None, values=None):
        isChurnRisk = False
        # ... Rule implementation starts here
        if values is not None:
            if values[0] == "FORCE_CHURN=true":
                isChurnRisk = True
        if not isChurnRisk and columns is not None:
            if columns["AGE"] <= 25:
                isChurnRisk = True
        # ...
        return isChurnRisk

# Execute the rule; it creates your new column in one line of code, that's all. Easy, isn't it?
# And look how columns and values are passed, it's really easy!
df = createColumnFromRule(df, columnName="CHURN_RISK", ruleClass=CustomerRisk(), ruleName="churnRisk", columnType="boolean", inputColumns=["NAME", "AGE", "ADDRESS"], inputValues=["FORCE_CHURN=true", "CHURN_RISK=100%"])

Related

Modify Different Pyspark Column on Exception in UDF

I have a data frame and a function that I want to run on every cell in my data frame:
def foo(x):
    # does stuff to x
    return x

foo_udf = udf(lambda x: foo(x), StringType())

df = (df.withColumn("col1", foo_udf(col("col1")))
        .withColumn("col2", foo_udf(col("col2")))
        .withColumn("col3", foo_udf(col("col3"))))
It simply modifies the data passed in and returns a new value to replace the passed in value.
However, there may be instances where an error occurs; for those instances, I have another column col4 which stores a boolean indicating whether or not the UDF failed for that row.
My issue is that when this occurs, I have no way of accessing col4 for that given row.
You can do this at the partition level with mapPartitions. I will use Fugue, which provides an easier interface to bring this to Spark.
First some setup:
from typing import List, Dict, Any, Iterable
import pandas as pd

def foo(x):
    if x == "E":
        raise ValueError()
    return x + "_ran"

def logic(df: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    for row in df:
        try:
            x = foo(row["col1"])
            y = foo(row["col2"])
            z = foo(row["col3"])
            # if it reaches here, we can update all columns for this row
            row["col1"] = x
            row["col2"] = y
            row["col3"] = z
            row["col4"] = False
        except Exception:
            row["col4"] = True
    return df
foo() is your original function and logic() is a wrapper to only update the columns if every foo() call is successful. Annotating the function will guide Fugue to apply conversions. From here we can use Fugue's transform() to test on Pandas.
df = pd.DataFrame({"col1": ["A", "B", "C"], "col2": ["A", "B", "C"], "col3": ["D", "E", "F"]})
from fugue import transform
transform(df, logic, schema="*, col4:boolean")
The schema is a requirement for Spark operations. This is just a minimal way to express it; Fugue handles the rest, and we get this result:
 col1   col2   col3   col4
A_ran  A_ran  D_ran  False
    B      B      E   True
C_ran  C_ran  F_ran  False
so we can bring it to Spark. We just need to supply a SparkSession.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(df)
transform(sdf, logic, schema="*, col4:boolean", engine=spark).show()
You can only return/change a single column from a UDF. However, this column can be a StructType containing the payload and an error flag. Then you can "unpack" the struct column into two (or more) normal columns.
from pyspark.sql import functions as F
from pyspark.sql import types as T

# some test data
data = [['A', 4],
        ['B', 2],
        ['C', 5]]
df = spark.createDataFrame(data, ["id", "col1"])

# the udf
def foo(x):
    if x == 5:
        error = True
    else:
        error = False
    return [x, error]

foo_udf = F.udf(lambda x: foo(x), returnType=T.StructType([
    T.StructField("x", T.StringType(), False),
    T.StructField("error", T.BooleanType(), False)
]))

# calling the udf and unpacking the return values
df.withColumn("col1", foo_udf("col1")) \
  .withColumn("error", F.col("col1.error")) \
  .withColumn("col1", F.col("col1.x")) \
  .show()
Output:
+---+----+-----+
| id|col1|error|
+---+----+-----+
| A| 4|false|
| B| 2|false|
| C| 5| true|
+---+----+-----+
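As a small follow-up, you can also unpack every field of the struct at once with the .* selector instead of pulling fields out one by one; a sketch assuming the same struct column as above:

df.withColumn("col1", foo_udf("col1")).select("id", "col1.*").show()

The unpacked columns take the struct field names (here x and error).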

Compare a column against a dictionary in Dask

I have a dictionary:
dict = {10: 1, 50: 2, 200: 3, 500: 4}
And a Dask DataFrame:
+---+---+
| a| b|
+---+---+
| 1| 24|
| 1| 49|
| 2|125|
| 3|400|
+---+---+
I want to groupBy a and get the minimum b value. After that, I want to check which dict key is closest to b and create a new column with the dict value.
As an example, when b=24, the closest key is 10, so I want to assign the value 1.
This is the result I am expecting:
+---+---+-------+
| a| b|closest|
+---+---+-------+
| 1| 24| 1|
| 1| 49| 2|
| 2|125| 3|
| 3|400| 4|
+---+---+-------+
I have found something similar with PySpark. I have not been able to make it run, but it apparently runs for other people. I am sharing it anyway for reference.
df = spark.createDataFrame(
    [
        (1, 24),
        (1, 49),
        (2, 125),
        (3, 400)
    ],
    ["a", "b"]
)

dict = {10: 1, 50: 2, 200: 3, 500: 4}

def func(value, dict):
    closest_key = (
        value if value in dict else builtins.min(
            dict.keys(), key=lambda k: builtins.abs(k - value)
        )
    )
    score = dict.get(closest_key)
    return score

df = (
    df.groupby('a')
    .agg(
        min('b')
    )
).withColumn('closest', func('b', dict))
From what I understand, in the Spark version the calculation was done per row, and I have not been able to replicate that.
Instead of thinking of it as a row-wise operation, you can think of it as a partition-wise operation. If my interpretation is off, you can still use this sample I wrote for the most part, with a few tweaks.
I will show a solution with Fugue that lets you just define your logic in Pandas, and then bring it to Dask. This will return a Dask DataFrame.
First some setup; note that df is a Pandas DataFrame. It is meant to represent a smaller sample you can test on:
import pandas as pd
import dask.dataframe as dd
import numpy as np
_dict = {10: 1, 50: 2, 200: 3, 500: 4}
df = pd.DataFrame({"a": [1,1,2,3], "b":[24,49,125,400]})
ddf = dd.from_pandas(df, npartitions=2)
and then we define the logic. This is written to handle one partition so everything in column a will already be the same value.
def logic(df: pd.DataFrame) -> pd.DataFrame:
    # handles the logic for 1 group; all values in a are the same
    min_b = df['b'].min()
    keys = np.array(list(_dict.keys()))
    # closest taken from https://stackoverflow.com/a/10465997/11163214
    closest = keys[np.abs(keys - min_b).argmin()]
    closest_val = _dict[closest]
    df = df.assign(closest=closest_val)
    return df
We can test this on Pandas:
logic(df.loc[df['a'] == 1])
and we'll get:
   a   b  closest
0  1  24        1
1  1  49        1
So then we can just bring it to Dask with Fugue. We just need to call the transform function:
from fugue import transform
ddf = transform(ddf,
                logic,
                schema="*,closest:int",
                partition={"by": "a"},
                engine="dask")
ddf.compute()
This can take in either Pandas or Dask DataFrames and will output the Dask DataFrame because we specified the "dask" engine. There is also a "spark" engine if you want a Spark DataFrame.
Schema is a requirement for distributed computing so we specify the output schema here. We also partition by column a.
Here is another approach for you, friend: this returns a NumPy array, but it will be faster than Spark, and you can easily reindex it.
import numpy as np
a = pydf.toNumpy()
a = a[:,1] # Grabs your b column
np.where([a <=10,a <=50,a<=200,a<=500],[1,2,3,4],a) # Check the closest values and fill them with what you want
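If you want the genuinely closest key rather than fixed thresholds, here is a hedged NumPy sketch, assuming a is the numeric 1-D array of b values from above and the same keys/values as the dictionary:

keys = np.array([10, 50, 200, 500])
vals = np.array([1, 2, 3, 4])
closest = vals[np.abs(keys[None, :] - a[:, None]).argmin(axis=1)]  # pick the value of the nearest key per element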

Optimization on for loop on columns in Pyspark

I don't know if my title is very clear. I have a table with a lot of columns (more than a hundred). Some of my columns contain values with brackets and I need to explode them into several rows. Here is a reproducible example:
# Import libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *
import pandas as ps
# Create an example
columns = ["Name", "Age", "Activity", "Studies"]
data = [("Jame", 25, "[Painting,Yoga]", "[Math,Physics]"), ("Anne", 20, "[Garden,Cooking,Travel]", "[Communication,Marketing]"), ("Jane", 10, "[Gymnastique]", "[Basic School]")]
df = spark.createDataFrame(data=data,schema=columns)
df.show(truncate=False)
it shows the following table:
+----+---+-----------------------+-------------------------+
|Name|Age|Activity |Studies |
+----+---+-----------------------+-------------------------+
|Jame|25 |[Painting,Yoga] |[Math,Physics] |
|Anne|20 |[Garden,Cooking,Travel]|[Communication,Marketing]|
|Jane|10 |[Gymnastique] |[Basic School] |
+----+---+-----------------------+-------------------------+
I need to determine which columns contain brackets as values:
list_col = df.dtypes
df_array_col = spark.createDataFrame(list_col)\
    .withColumnRenamed("_1", "Colname")\
    .withColumnRenamed("_2", "TypeColumn")\
    .filter(col("TypeColumn") == "string")\
    .withColumn("IsBracket", lit(0))\
    .toPandas()

# Function for determining what column contains brackets as a value
def func_isSquaredBracket(my_col):
    A = df.select(first(col(my_col).rlike("\["), ignorenulls=True).alias(my_col))
    val_IsBracket = A.select(col(my_col)).collect()[0][0]
    return val_IsBracket

# For loop for applying the function
n_array = df_array_col.count()["Colname"]
for index, row in df_array_col.iterrows():
    IsBracket_value = func_isSquaredBracket(df_array_col.at[index, "Colname"])
    if IsBracket_value == True:
        df_array_col.at[index, "IsBracket"] = 1
I succeed in finding which columns have brackets as values. Now I can explode my table:
def func_extractStringInBracket_andSplit(my_col):
    extract_string = regexp_extract(my_col, r'(?<=\[).+?(?=\])', 0).alias(my_col)
    string_split = split(extract_string, "\||,").alias(my_col)
    string_explode_array = explode_outer(string_split).alias(my_col)
    return string_explode_array

df_explode_bracket = df
for index, row in df_array_bracket_col.iterrows():
    colname = df_array_bracket_col["Colname"][index]
    df_explode_bracket = df_explode_bracket.withColumn(colname, func_extractStringInBracket_andSplit(colname))
df_explode_bracket.show(truncate=False)
I obtain the result I want:
+----+---+-----------+-------------+
|Name|Age|Activity |Studies |
+----+---+-----------+-------------+
|Jame|25 |Painting |Math |
|Jame|25 |Painting |Physics |
|Jame|25 |Yoga |Math |
|Jame|25 |Yoga |Physics |
|Anne|20 |Garden |Communication|
|Anne|20 |Garden |Marketing |
|Anne|20 |Cooking |Communication|
|Anne|20 |Cooking |Marketing |
|Anne|20 |Travel |Communication|
|Anne|20 |Travel |Marketing |
|Jane|10 |Gymnastique|Basic School |
+----+---+-----------+-------------+
However, this solution is not optimized when I have more than 100 columns and it takes more than 6 minutes to get the result with the following message:
/opt/spark/python/lib/pyspark.zip/pyspark/sql/pandas/conversion.py:289: UserWarning: createDataFrame attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, failed by the reason below:
'JavaPackage' object is not callable
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
warnings.warn(msg)
I am pretty new to PySpark and I am not an expert in Python. My question is: how can I optimize the solution by using PySpark instead of Pandas? A for loop is not ideal when you have the opportunity to use parallel processing.
It's actually pretty easy: use regexp_extract_all:
import pyspark.sql.functions as F

df = (
    df.withColumn("Activity_list", F.expr(r"regexp_extract_all(Activity, '(\\w+)', 1)"))
    .withColumn("Studies_list", F.expr(r"regexp_extract_all(Studies, '(\\w+)', 1)"))
)
df = (
    df.drop("Activity", "Studies")
    .withColumn("Activity", F.explode("Activity_list"))
    .withColumn("Studies", F.explode("Studies_list"))
)
Edit: It even works with strings without brackets.
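Since the question mentions more than a hundred columns, the same expression can be applied in a loop instead of naming each column; a minimal sketch, where bracket_cols is a hypothetical list of the columns you want to explode:

bracket_cols = ["Activity", "Studies"]  # assumption: the bracketed columns found earlier
for c in bracket_cols:
    df = df.withColumn(c, F.explode(F.expr(f"regexp_extract_all({c}, '(\\\\w+)', 1)")))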

how to use round(col(),col()) in pyspark?

I want to use ROUND function like this:
CAST(ROUND(CostAmt,ISNULL(CurrencyDecimalPlaceNum)) AS decimal(32,8))
in pyspark.
In DataFrame and SQL, the ROUND function takes the first argument as a column and the second argument as an int, but I want to pass the second argument as another column.
If I try to use the second argument as a column, it gives the error "Column is not callable".
Pyspark code:
round(
col("CostAmt"),
coalesce(col("CurrencyDecimalPlaceNum").cast(IntegerType()), lit(2)),
).cast(DecimalType(23, 6))
how to solve this issue?
The round() function takes a column and an int as arguments (see the docs). The problem is that you are passing two columns as arguments, since coalesce returns a column.
I'm not sure how to do it using coalesce; I would use a UDF and create a function that rounds the number, and then apply it to both columns like this:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

def round_value(value, scale):
    if scale is None:
        scale = 2
    return round(value, scale)

if __name__ == "__main__":
    spark = SparkSession.builder.master("local").appName("Test").getOrCreate()
    df = spark.createDataFrame(
        [
            (1, 1, 2.3445),
            (2, None, 168.454523),
            (3, 4, 3500.345354),
        ],
        ["id", "CurrencyDecimalPlaceNum", "float_col"],
    )
    round_udf = F.udf(lambda x, y: round_value(x, y))
    df = df.withColumn(
        "round",
        round_udf(
            F.col("float_col"),
            F.col("CurrencyDecimalPlaceNum"),
        ),
    )
    # show the result
    df.show()
Result:
+---+-----------------------+-----------+---------+
| id|CurrencyDecimalPlaceNum| float_col| round|
+---+-----------------------+-----------+---------+
| 1| 1| 2.3445| 2.3|
| 2| null| 168.454523| 168.45|
| 3| 4|3500.345354|3500.3454|
+---+-----------------------+-----------+---------+
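Note that F.udf without an explicit returnType defaults to StringType, so the round column above is a string; a hedged follow-up, assuming you still want the decimal type from the original SQL expression:

df = df.withColumn("round", F.col("round").cast("decimal(32,8)"))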

Python spark from DenseVector to columns [duplicate]

This question already has answers here:
How to access element of a VectorUDT column in a Spark DataFrame?
(5 answers)
Closed 7 months ago.
Context: I have a DataFrame with 2 columns: word and vector. Where the column type of "vector" is VectorUDT.
An Example:
word | vector
assert | [435,323,324,212...]
And I want to get this:
word | v1 | v2 | v3 | v4 | v5 | v6 ......
assert | 435 | 5435| 698| 356|....
Question:
How can I split a column with vectors into several columns, one for each dimension, using PySpark?
Thanks in advance
Spark >= 3.0.0
Since Spark 3.0.0 this can be done without using a UDF.
from pyspark.ml.functions import vector_to_array
(df
    .withColumn("xs", vector_to_array("vector"))
    .select(["word"] + [col("xs")[i] for i in range(3)]))
## +-------+-----+-----+-----+
## | word|xs[0]|xs[1]|xs[2]|
## +-------+-----+-----+-----+
## | assert| 1.0| 2.0| 3.0|
## |require| 0.0| 2.0| 0.0|
## +-------+-----+-----+-----+
Spark < 3.0.0
One possible approach is to convert to and from RDD:
from pyspark.ml.linalg import Vectors
df = sc.parallelize([
("assert", Vectors.dense([1, 2, 3])),
("require", Vectors.sparse(3, {1: 2}))
]).toDF(["word", "vector"])
def extract(row):
    return (row.word, ) + tuple(row.vector.toArray().tolist())
df.rdd.map(extract).toDF(["word"]) # Vector values will be named _2, _3, ...
## +-------+---+---+---+
## | word| _2| _3| _4|
## +-------+---+---+---+
## | assert|1.0|2.0|3.0|
## |require|0.0|2.0|0.0|
## +-------+---+---+---+
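If you want meaningful names instead of _2, _3, ..., you can pass the full list of column names to toDF; a small sketch assuming three vector dimensions as in this example:

df.rdd.map(extract).toDF(["word", "v1", "v2", "v3"])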
An alternative solution would be to create a UDF:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType
def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed, i.e.
    #   return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)
(df
.withColumn("xs", to_array(col("vector")))
.select(["word"] + [col("xs")[i] for i in range(3)]))
## +-------+-----+-----+-----+
## | word|xs[0]|xs[1]|xs[2]|
## +-------+-----+-----+-----+
## | assert| 1.0| 2.0| 3.0|
## |require| 0.0| 2.0| 0.0|
## +-------+-----+-----+-----+
For Scala equivalent see Spark Scala: How to convert Dataframe[vector] to DataFrame[f1:Double, ..., fn: Double)].
To split the rawPrediction or probability columns generated after training a PySpark ML model into Pandas columns, you can do it like this:
your_pandas_df['probability'].apply(lambda x: pd.Series(x.toArray()))
It is much faster to use the i_th udf from how-to-access-element-of-a-vectorudt-column-in-a-spark-dataframe.
The extract function given in the solution by zero323 above uses toList, which creates a Python list object, populates it with Python float objects, and finds the desired element by traversing the list; the element then needs to be converted back to a Java double, and this is repeated for each row. Using the RDD is much slower than the to_array udf, which also calls toList, but both are much slower than a udf that lets SparkSQL handle most of the work.
Timing code comparing the rdd extract and the to_array udf proposed here to the i_th udf from 3955864:
from pyspark.context import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import lit, udf, col
from pyspark.sql.types import ArrayType, DoubleType
import pyspark.sql.dataframe
from pyspark.sql.functions import pandas_udf, PandasUDFType
sc = SparkContext('local[4]', 'FlatTestTime')
spark = SparkSession(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", True)
from pyspark.ml.linalg import Vectors
# copy the two rows in the test dataframe a bunch of times,
# make this small enough for testing, or go for "big data" and be prepared to wait
REPS = 20000
df = sc.parallelize([
    ("assert", Vectors.dense([1, 2, 3]), 1, Vectors.dense([4.1, 5.1])),
    ("require", Vectors.sparse(3, {1: 2}), 2, Vectors.dense([6.2, 7.2])),
] * REPS).toDF(["word", "vector", "more", "vorpal"])
def extract(row):
    return (row.word, ) + tuple(row.vector.toArray().tolist(),) + (row.more,) + tuple(row.vorpal.toArray().tolist(),)

def test_extract():
    return df.rdd.map(extract).toDF(['word', 'vector__0', 'vector__1', 'vector__2', 'more', 'vorpal__0', 'vorpal__1'])

def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    return udf(to_array_, ArrayType(DoubleType()))(col)

def test_to_array():
    df_to_array = df.withColumn("xs", to_array(col("vector"))) \
        .select(["word"] + [col("xs")[i] for i in range(3)] + ["more", "vorpal"]) \
        .withColumn("xx", to_array(col("vorpal"))) \
        .select(["word"] + ["xs[{}]".format(i) for i in range(3)] + ["more"] + [col("xx")[i] for i in range(2)])
    return df_to_array
# pack up to_array into a tidy function
def flatten(df, vector, vlen):
    fieldNames = df.schema.fieldNames()
    if vector in fieldNames:
        names = []
        for fieldname in fieldNames:
            if fieldname == vector:
                names.extend([col(vector)[i] for i in range(vlen)])
            else:
                names.append(col(fieldname))
        return df.withColumn(vector, to_array(col(vector)))\
            .select(names)
    else:
        return df
def test_flatten():
    dflat = flatten(df, "vector", 3)
    dflat2 = flatten(dflat, "vorpal", 2)
    return dflat2

def ith_(v, i):
    try:
        return float(v[i])
    except ValueError:
        return None

ith = udf(ith_, DoubleType())

select = ["word"]
select.extend([ith("vector", lit(i)) for i in range(3)])
select.append("more")
select.extend([ith("vorpal", lit(i)) for i in range(2)])

# %% timeit ...
def test_ith():
    return df.select(select)
if __name__ == '__main__':
    import timeit

    # make sure these work as intended
    test_ith().show(4)
    test_flatten().show(4)
    test_to_array().show(4)
    test_extract().show(4)

    print("i_th\t\t",
          timeit.timeit("test_ith()",
                        setup="from __main__ import test_ith",
                        number=7)
          )
    print("flatten\t\t",
          timeit.timeit("test_flatten()",
                        setup="from __main__ import test_flatten",
                        number=7)
          )
    print("to_array\t",
          timeit.timeit("test_to_array()",
                        setup="from __main__ import test_to_array",
                        number=7)
          )
    print("extract\t\t",
          timeit.timeit("test_extract()",
                        setup="from __main__ import test_extract",
                        number=7)
          )
Results:
i_th 0.05964796099999958
flatten 0.4842299350000001
to_array 0.42978780299999997
extract 2.9254476840000017
def splitVector(df, new_features=['f1', 'f2']):
    schema = df.schema
    cols = df.columns
    for col in new_features:  # new_features should be the same length as the vector column
        schema = schema.add(col, DoubleType(), True)
    return spark.createDataFrame(df.rdd.map(lambda row: [row[i] for i in cols] + row.features.tolist()), schema)

The function turns the feature vector column into separate columns.
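Hypothetical usage, assuming df has a "features" vector column with two dimensions:

split_df = splitVector(df, new_features=["f1", "f2"])
split_df.show()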
