Pyspark pass function as a parameter to UDF - python

I'm trying to create a UDF which takes another function as a parameter. But the execution ends up with an exception.
The code I run:
import pandas as pd
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import MapType, DataType, StringType
from pyspark.sql.functions import udf, struct, lit
import os
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
df_to_test = sqlContext.createDataFrame(
    pd.DataFrame({
        'inn': ['111', '222', '333'],
        'field1': [1, 2, 3],
        'field2': ['a', 'b', 'c']
    }))
def foo_fun(row, b) -> str:
    return 'a' + b()

def bar_fun():
    return 'I am bar'
foo_fun_udf = udf(foo_fun, StringType())
df_to_test.withColumn(
    'foo',
    foo_fun_udf(struct([df_to_test[x] for x in df_to_test.columns]), bar_fun)
).show()
The exception:
Invalid argument, not a string or column: <function bar_fun at 0x7f0e69ce6268> of type <class 'function'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
I tried to wrap bar_fun in a udf as well, with no success. Is there a way to pass a function as a parameter to a UDF?

You are not far from the solution. Here is how I would do it:
def foo_fun_udf(func):
    def foo_fun(row) -> str:
        return 'a' + func()
    out_udf = udf(foo_fun, StringType())
    return out_udf
df_to_test.withColumn(
    'foo',
    foo_fun_udf(bar_fun)(struct([df_to_test[x] for x in df_to_test.columns]))
).show()
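This works because bar_fun is captured in the closure at the moment the UDF is created, so Spark only ever serializes a one-argument function of the row. If you would rather keep the original two-argument foo_fun from the question, a similar effect can be achieved with functools.partial; a minimal sketch reusing foo_fun, bar_fun and df_to_test from above (foo_fun_partial_udf is just an illustrative name):
from functools import partial

# Bind the callable argument up front; the resulting function only takes the row,
# which is all Spark needs to call on the executors.
foo_fun_partial_udf = udf(partial(foo_fun, b=bar_fun), StringType())

df_to_test.withColumn(
    'foo',
    foo_fun_partial_udf(struct([df_to_test[x] for x in df_to_test.columns]))
).show()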

Related

How to add multiple columns to pyspark DF using pandas_udf with multiple source columns?

I need to extract the date and the hour from utc_timestamp into two different columns, depending on the time zone. The time zone name is determined by id via a configuration constant.
Input DF:
+-------------+--+
|utc_timestamp|id|
+-------------+--+
|1608000000782|1 |
|1608000240782|2 |
+-------------+--+
Output DF:
+-------------+--+----------+----+
|utc_timestamp|id|date      |hour|
+-------------+--+----------+----+
|1608000000782|1 |2020-12-14|20  |
|1608000240782|2 |2020-12-15|11  |
+-------------+--+----------+----+
I have a pandas_udf that lets me extract one column at a time, and I have to create it twice:
from pyspark.sql import DataFrame
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pyspark.sql.functions as f
from pyspark.sql.types import DateType, IntegerType
import pandas as pd
import pytz

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

class TimezoneUdfProvider(object):
    def __init__(self):
        self.extract_date_udf = pandas_udf(self._extract_date, DateType(), PandasUDFType.SCALAR)
        self.extract_hour_udf = pandas_udf(self._extract_hour, IntegerType(), PandasUDFType.SCALAR)

    def _extract_date(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_date(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])

    def _extract_hour(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_hour(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])

def extract_date(utc_timestamp: int, id: int):
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).date()

def extract_hour(utc_timestamp: int, id: int) -> int:
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).hour

def extract_from_utc(df: DataFrame) -> DataFrame:
    timezone_udf1 = TimezoneUdfProvider()
    df_with_date = df.withColumn('date', timezone_udf1.extract_date_udf(f.col('utc_timestamp'), f.col('id')))
    timezone_udf2 = TimezoneUdfProvider()
    df_with_hour = df_with_date.withColumn('hour', timezone_udf2.extract_hour_udf(f.col('utc_timestamp'), f.col('id')))
    return df_with_hour
Is there a better way to do it, without needing to use the same UDF provider twice?
You can do this without a UDF, using Spark built-in functions.
We can use create_map to map the dictionary and create a new TimeZone column, then convert with from_unixtime and from_utc_timestamp, using the newly mapped column as the time zone. Once we have the timestamp in each row's time zone, we can fetch the hour and date fields.
import pyspark.sql.functions as F
from itertools import chain

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

map_exp = F.create_map([F.lit(i) for i in chain(*TIMEZONE_LIST.items())])

final = (df.withColumn("TimeZone", map_exp.getItem(F.col("id")))
           .withColumn("Timestamp",
                       F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp") / 1000), F.col("TimeZone")))
           .withColumn("date", F.to_date("Timestamp"))
           .withColumn("Hour", F.hour("Timestamp"))
           .drop("Timestamp"))

final.show()
+-------------+---+---------------+----------+----+
|utc_timestamp| id| TimeZone| date|Hour|
+-------------+---+---------------+----------+----+
|1608000000782| 1|America/Chicago|2020-12-14| 20|
|1608000240782| 2| Asia/Tokyo|2020-12-15| 11|
+-------------+---+---------------+----------+----+
EDIT: replacing create_map with a udf:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

def fun(x):
    return TIMEZONE_LIST.get(x, None)

map_udf = F.udf(fun, StringType())

final = (df.withColumn("TimeZone", map_udf("id"))
           .withColumn("Timestamp",
                       F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp") / 1000), F.col("TimeZone")))
           .withColumn("date", F.to_date("Timestamp"))
           .withColumn("Hour", F.hour("Timestamp"))
           .drop("Timestamp"))

final.show()

Python Pandas Unit Test is not recognized

I have an issue with a unit test definition. I want to test data frames, and I do not understand why the following result is returned.
Result:
Ran 0 tests in 0.000s
OK
Script:
import unittest
import pandas as pd
from pandas._testing import assert_frame_equal

def df_minus(df_main: pd.DataFrame, df_subset: pd.DataFrame) -> pd.DataFrame:
    return df_main

class TestDataFrameMinus(unittest.TestCase):
    def df_minus_equal(self):
        df_A = pd.DataFrame(data={
            'col1': [1, 2, 3, 4]
        })
        df_B = pd.DataFrame(data={
            'col1': [1, 2, 3]
        })
        df_result = pd.DataFrame(data={
            'col1': [1, 2, 3]
        })
        assert_frame_equal(df_minus(df_A, df_B), df_result)

if __name__ == '__main__':
    unittest.main()
Do you have any idea why the test is not visible?
You should name your test methods with the test_ prefix; unittest only discovers and runs methods whose names start with test.
def test_df_minus_equal(self):
    pass
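For reference, a minimal sketch of the corrected test class, using the same df_minus helper and imports as in the question. Running the script now picks up and executes the test (it will fail against the df_minus stub above, but it is no longer silently skipped):
class TestDataFrameMinus(unittest.TestCase):
    # The test_ prefix is what makes unittest discover and run this method.
    def test_df_minus_equal(self):
        df_A = pd.DataFrame(data={'col1': [1, 2, 3, 4]})
        df_B = pd.DataFrame(data={'col1': [1, 2, 3]})
        df_result = pd.DataFrame(data={'col1': [1, 2, 3]})
        assert_frame_equal(df_minus(df_A, df_B), df_result)

if __name__ == '__main__':
    unittest.main()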

How to replace non ascii characters in pyspark dataframe [duplicate]

I need to delete accents from characters in Spanish and other languages in different datasets.
I already wrote a function, based on the code provided in this post, that removes the accents. The problem is that the function is slow because it uses a UDF.
I'm just wondering if I can improve its performance and get results in less time, because this is fine for small dataframes but not for big ones.
Thanks in advance.
Here is the code; you will be able to run it as it is presented:
# Importing sql types
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf, col
import unicodedata

# Building a simple dataframe:
schema = StructType([StructField("city", StringType(), True),
                     StructField("country", StringType(), True),
                     StructField("population", IntegerType(), True)])

countries = ['Venezuela', 'US#A', 'Brazil', 'Spain']
cities = ['Maracaibó', 'New York', ' São Paulo ', '~Madrid']
population = [37800000, 19795791, 12341418, 6489162]

# Dataframe:
df = sqlContext.createDataFrame(list(zip(cities, countries, population)), schema=schema)
df.show()

class Test():
    def __init__(self, df):
        self.df = df

    def clearAccents(self, columns):
        """This function deletes accents in the string columns of a dataFrame;
        it does not eliminate the main characters, but only deletes special tildes.
        :param columns  String or a list of column names.
        """
        # Filter all string columns in the dataFrame
        validCols = [c for (c, t) in filter(lambda t: t[1] == 'string', self.df.dtypes)]

        # If "*" is provided as the column parameter, use all string columns:
        if columns == "*":
            columns = validCols[:]

        # Receives a string as an argument
        def remove_accents(inputStr):
            # First, normalize strings:
            nfkdStr = unicodedata.normalize('NFKD', inputStr)
            # Keep chars that have no other char combined (i.e. drop combining accent chars)
            withOutAccents = u"".join([c for c in nfkdStr if not unicodedata.combining(c)])
            return withOutAccents

        function = udf(lambda x: remove_accents(x) if x is not None else x, StringType())
        exprs = [function(col(c)).alias(c) if (c in columns) and (c in validCols) else c for c in self.df.columns]
        self.df = self.df.select(*exprs)

foo = Test(df)
foo.clearAccents(columns="*")
foo.df.show()
One possible improvement is to build a custom Transformer, which will handle the Unicode normalization, and a corresponding Python wrapper. It should reduce the overall overhead of passing data between the JVM and Python, and it doesn't require any modifications to Spark itself or access to a private API.
On the JVM side you'll need a transformer similar to this one:
package net.zero323.spark.ml.feature

import java.text.Normalizer
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{DataType, StringType}

class UnicodeNormalizer (override val uid: String)
  extends UnaryTransformer[String, String, UnicodeNormalizer] {

  def this() = this(Identifiable.randomUID("unicode_normalizer"))

  private val forms = Map(
    "NFC" -> Normalizer.Form.NFC, "NFD" -> Normalizer.Form.NFD,
    "NFKC" -> Normalizer.Form.NFKC, "NFKD" -> Normalizer.Form.NFKD
  )

  val form: Param[String] = new Param(this, "form", "unicode form (one of NFC, NFD, NFKC, NFKD)",
    ParamValidators.inArray(forms.keys.toArray))

  def setN(value: String): this.type = set(form, value)

  def getForm: String = $(form)

  setDefault(form -> "NFKD")

  override protected def createTransformFunc: String => String = {
    val normalizerForm = forms($(form))
    (s: String) => Normalizer.normalize(s, normalizerForm)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = StringType
}
Corresponding build definition (adjust Spark and Scala versions to match your Spark deployment):
name := "unicode-normalization"
version := "1.0"
crossScalaVersions := Seq("2.11.12", "2.12.8")
organization := "net.zero323"
val sparkVersion = "2.4.0"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion
)
On the Python side you'll need a wrapper similar to this one:
from pyspark.ml.param.shared import *
# from pyspark.ml.util import keyword_only  # in Spark < 2.0
from pyspark import keyword_only
from pyspark.ml.wrapper import JavaTransformer

class UnicodeNormalizer(JavaTransformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, form="NFKD", inputCol=None, outputCol=None):
        super(UnicodeNormalizer, self).__init__()
        self._java_obj = self._new_java_obj(
            "net.zero323.spark.ml.feature.UnicodeNormalizer", self.uid)
        self.form = Param(self, "form",
                          "unicode form (one of NFC, NFD, NFKC, NFKD)")
        # kwargs = self.__init__._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, form="NFKD", inputCol=None, outputCol=None):
        # kwargs = self.setParams._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setForm(self, value):
        return self._set(form=value)

    def getForm(self):
        return self.getOrDefault(self.form)
Build Scala package:
sbt +package
and include it when you start the shell or submit a job. For example, for a Spark build with Scala 2.11:
bin/pyspark --jars path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar \
--driver-class-path path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar
and you should be ready to go. All that is left is a little bit of regexp magic:
from pyspark.sql.functions import regexp_replace

normalizer = UnicodeNormalizer(form="NFKD",
    inputCol="text", outputCol="text_normalized")

df = sc.parallelize([
    (1, "Maracaibó"), (2, "New York"),
    (3, " São Paulo "), (4, "~Madrid")
]).toDF(["id", "text"])

(normalizer
    .transform(df)
    .select(regexp_replace("text_normalized", "\p{M}", ""))
    .show())
## +--------------------------------------+
## |regexp_replace(text_normalized,\p{M},)|
## +--------------------------------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## +--------------------------------------+
Please note that this follows the same conventions as the built-in text transformers and is not null safe. You can easily correct for that by checking for null in createTransformFunc.
Another way to do it is to use the Python Unicode database:
import unicodedata
import sys

from pyspark.sql.functions import translate, regexp_replace

def make_trans():
    matching_string = ""
    replace_string = ""

    for i in range(ord(" "), sys.maxunicode):
        name = unicodedata.name(chr(i), "")
        if "WITH" in name:
            try:
                base = unicodedata.lookup(name.split(" WITH")[0])
                matching_string += chr(i)
                replace_string += base
            except KeyError:
                pass

    return matching_string, replace_string

def clean_text(c):
    matching_string, replace_string = make_trans()
    return translate(
        regexp_replace(c, "\p{M}", ""),
        matching_string, replace_string
    ).alias(c)
So now let's test it:
df = sc.parallelize([
    (1, "Maracaibó"), (2, "New York"),
    (3, " São Paulo "), (4, "~Madrid"),
    (5, "São Paulo"), (6, "Maracaibó")
]).toDF(["id", "text"])

df.select(clean_text("text")).show()
## +---------------+
## | text|
## +---------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## | Sao Paulo|
## | Maracaibo|
## +---------------+
Acknowledgment to @zero323.
This solution is Python only, but is only useful if the number of possible accents is low (e.g. one single language like Spanish) and the character replacements are manually specified.
There seems to be no built-in way to do what you asked for directly without UDFs, however you can chain many regexp_replace calls to replace each possible accented character. I tested the performance of this solution and it turns out that it only runs faster if you have a very limited set of accents to replace. If that's the case it can be faster than UDFs because it is optimized outside of Python.
from pyspark.sql.functions import col, regexp_replace

accent_replacements_spanish = [
    (u'á', 'a'), (u'Á', 'A'),
    (u'é', 'e'), (u'É', 'E'),
    (u'í', 'i'), (u'Í', 'I'),
    (u'ò', 'o'), (u'Ó', 'O'),
    (u'ú|ü', 'u'), (u'Ú|Ű', 'U'),
    (u'ñ', 'n'),
    # see http://stackoverflow.com/a/18123985/3810493 for other characters
    # this will convert other non ASCII characters to a question mark:
    ('[^\x00-\x7F]', '?')
]

def remove_accents(column):
    r = col(column)
    for a, b in accent_replacements_spanish:
        r = regexp_replace(r, a, b)
    return r.alias('remove_accents(' + column + ')')

df = sqlContext.createDataFrame([['Olà'], ['Olé'], ['Núñez']], ['str'])
df.select(remove_accents('str')).show()
I haven't compared the performance with the other responses and this function is not as general, but it is at least worth considering because you don't need to add Scala or Java to your build process.
Here's my implementation.
Apart from accents, I also remove special characters, because I needed to pivot and save a table, and you can't save a table with a column name that contains the characters " ,;{}()\n\t=\/".
import re

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from unidecode import unidecode

spark = SparkSession.builder.getOrCreate()

data = [(1, " \\ / \\ {____} aŠdá_ \t = \n () asd ____aa 2134_ 23_"), (1, "N"), (2, "false"), (2, "1"), (3, "NULL"),
        (3, None)]
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])

df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
df.show()

for col_name in ["txt"]:
    tmp_dict = {}
    for col_value in [row[0] for row in df.select(col_name).distinct().toLocalIterator()
                      if row[0] is not None]:
        new_col_value = re.sub("[ ,;{}()\\n\\t=\\\/]", "_", col_value)
        new_col_value = re.sub('_+', '_', new_col_value)
        if new_col_value.startswith("_"):
            new_col_value = new_col_value[1:]
        if new_col_value.endswith("_"):
            new_col_value = new_col_value[:-1]
        new_col_value = unidecode(new_col_value)
        tmp_dict[col_value] = new_col_value.lower()
    df = df.na.replace(to_replace=tmp_dict, subset=[col_name])

df.show()
If you can't access external libraries (like me), you can replace unidecode with:
new_col_value = new_col_value.translate(str.maketrans(
"ä,ö,ü,ẞ,á,ä,č,ď,é,ě,í,ĺ,ľ,ň,ó,ô,ŕ,š,ť,ú,ů,ý,ž,Ä,Ö,Ü,ẞ,Á,Ä,Č,Ď,É,Ě,Í,Ĺ,Ľ,Ň,Ó,Ô,Ŕ,Š,Ť,Ú,Ů,Ý,Ž",
"a,o,u,s,a,a,c,d,e,e,i,l,l,n,o,o,r,s,t,u,u,y,z,A,O,U,S,A,A,C,D,E,E,I,L,L,N,O,O,R,S,T,U,U,Y,Z"))

How to resolve duplicate column names while joining two dataframes in PySpark?

I have files A and B which are exactly the same. I am trying to perform inner and outer joins on these two dataframes. Since I have all the columns as duplicate columns, the existing answers were of no help.
The other questions that I have gone through contain a col or two as duplicate, my issue is that the whole files are duplicates of each other: both in data and in column names.
My code:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrameReader, DataFrameWriter
from datetime import datetime
import time
# #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
print("All imports were successful.")
df = spark.read.orc(
    's3://****'
)
print("First dataframe read with headers set to True")
df2 = spark.read.orc(
    's3://****'
)
print("Second dataframe read with headers set to True")
# df3 = df.join(df2, ['c_0'], "outer")
# df3 = df.join(
# df2,
# df["column_test_1"] == df2["column_1"],
# "outer"
# )
df3 = df.alias('l').join(df2.alias('r'), on='c_0') #.collect()
print("Dataframes have been joined successfully.")
output_file_path = 's3://****'
df3.write.orc(
    output_file_path
)
print("Dataframe has been written to csv.")
job.commit()
The error that I am facing is:
pyspark.sql.utils.AnalysisException: u'Duplicate column(s): "c_4", "c_38", "c_13", "c_27", "c_50", "c_16", "c_23", "c_24", "c_1", "c_35", "c_30", "c_56", "c_34", "c_7", "c_46", "c_49", "c_57", "c_45", "c_31", "c_53", "c_19", "c_25", "c_10", "c_8", "c_14", "c_42", "c_20", "c_47", "c_36", "c_29", "c_15", "c_43", "c_32", "c_5", "c_37", "c_18", "c_54", "c_3", "__created_at__", "c_51", "c_48", "c_9", "c_21", "c_26", "c_44", "c_55", "c_2", "c_17", "c_40", "c_28", "c_33", "c_41", "c_22", "c_11", "c_12", "c_52", "c_6", "c_39" found, cannot save to file.;'
End of LogType:stdout
There is no shortcut here. Pyspark expects the left and right dataframes to have distinct sets of field names (with the exception of the join key).
One solution would be to prefix each field name with either "left_" or "right_", as follows:
# Obtain columns lists
left_cols = df.columns
right_cols = df2.columns

# Prefix each dataframe's field with "left_" or "right_"
df = df.selectExpr([col + ' as left_' + col for col in left_cols])
df2 = df2.selectExpr([col + ' as right_' + col for col in right_cols])

# Perform the join on the prefixed key columns
df3 = df.join(df2, df['left_c_0'] == df2['right_c_0'])
Here is a helper function to join two dataframes adding aliases:
def join_with_aliases(left, right, on, how, right_prefix):
    # Rename every non-key column of the right dataframe by appending right_prefix;
    # the join keys are kept as they are.
    renamed_right = right.selectExpr(
        [
            col + f" as {col}{right_prefix}"
            for col in right.columns
            if col not in on
        ]
        + on
    )
    return left.join(renamed_right, on=on, how=how)
and here is an example of how to use it:
df1 = spark.createDataFrame([[1, "a"], [2, "b"], [3, "c"]], ("id", "value"))
df2 = spark.createDataFrame([[1, "a"], [2, "b"], [3, "c"]], ("id", "value"))
join_with_aliases(
    left=df1,
    right=df2,
    on=["id"],
    how="inner",
    right_prefix="_right"
).show()
+---+-----+-----------+
| id|value|value_right|
+---+-----+-----------+
|  1|    a|          a|
|  3|    c|          c|
|  2|    b|          b|
+---+-----+-----------+
I did something like this, but in Scala; you can convert the same into PySpark as well (a PySpark sketch follows after the Scala snippets).
Rename the column names in each dataframe
dataFrame1.columns.foreach(columnName => {
  dataFrame1 = dataFrame1.select(dataFrame1.columns.head, dataFrame1.columns.tail: _*).withColumnRenamed(columnName, s"left_$columnName")
})

dataFrame2.columns.foreach(columnName => {
  dataFrame2 = dataFrame2.select(dataFrame2.columns.head, dataFrame2.columns.tail: _*).withColumnRenamed(columnName, s"right_$columnName")
})
Now join by mentioning the column names
val resultDF = dataFrame1.join(dataFrame2, dataFrame1("left_c_0") === dataFrame2("right_c_0"))
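For reference, a minimal PySpark sketch of the same rename-then-join approach, assuming df and df2 are the two dataframes from the question and c_0 is the join key:
from functools import reduce

# Prefix every column of each dataframe, then join on the renamed key columns.
df_left = reduce(lambda d, c: d.withColumnRenamed(c, 'left_' + c), df.columns, df)
df_right = reduce(lambda d, c: d.withColumnRenamed(c, 'right_' + c), df2.columns, df2)

result_df = df_left.join(df_right, df_left['left_c_0'] == df_right['right_c_0'])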

Python (pyspark) - Function decorator for UDF definition

I'm trying to define a UDF with an accumulator inside it. The accumulator is used to save the exceptions raised by my_function for later. I came up with a definition of the udf that takes some arguments (returnType, accumulator).
I'd like to make it a bit more readable and reusable. How could I define a decorator function with the code below?
from pyspark.sql import functions as F
from pyspark.accumulators import AccumulatorParam
from pyspark.sql.types import StringType, StructField, IntegerType, StructType
from pyspark.sql import Row

data = [
    Row(word="foo", number=7),
    Row(word="bar", number=13)]

schema = StructType([
    StructField("word", StringType(), True),
    StructField("number", IntegerType(), True)])

df = spark.createDataFrame(data, schema)
Creation of my custom accumulator
class ListParam(AccumulatorParam):
    def zero(self, v):
        return []

    def addInPlace(self, variable, value):
        variable.append(value)
        return variable

accum = spark.sparkContext.accumulator([], ListParam())
Definition of my udf
def accumulator_udf(accumulator, returnType):
    def my_function(x):
        y = None
        try:
            y = (x / (x - 7))
        except Exception as e:
            accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
        return y
    return F.udf(my_function, returnType)

my_udf = accumulator_udf(accumulator=accum, returnType=IntegerType())
Result
df.select(my_udf(df.number)).show()
+---------------+
|div_one(number)|
+---------------+
| null|
| 2|
+---------------+
print(accum.value)
> [[{'errorType': 'integer division or modulo by zero', 'Data': 7}], []]
What I've tried
After a few readings, I found this post (https://www.thecodeship.com/patterns/guide-to-python-function-decorators/) which helped, but I'm stuck with a NameError.
def accumulator_udf(accumulator, returnType):
    def func_wrapper(func):
        return F.udf(func, returnType)
    return func_wrapper

accum = spark.sparkContext.accumulator([], ListParam())

@accumulator_udf(accumulator=accum, returnType=IntegerType())
def my_function(x):
    y = None
    try:
        y = (x / (x - 7))
    except Exception as e:
        accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
    return y

df.select(my_function(df.number)).show()
When trying this implementation, I get the following error:
NameError: global name 'accumulator' is not defined
How can I manage to access 'accumulator'?
Thanks!
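One way to avoid the NameError is to let the decorator do the exception bookkeeping itself, so the decorated function never needs to reference the accumulator. A minimal sketch, reusing accum, ListParam and df from above (one possible approach, not the only one):
def accumulator_udf(accumulator, returnType):
    def func_wrapper(func):
        def wrapped(x):
            try:
                return func(x)
            except Exception as e:
                # The accumulator is captured here, in the decorator's closure,
                # so the decorated function never refers to it directly.
                accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
                return None
        return F.udf(wrapped, returnType)
    return func_wrapper

@accumulator_udf(accumulator=accum, returnType=IntegerType())
def my_function(x):
    return x / (x - 7)

df.select(my_function(df.number)).show()
print(accum.value)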
