I am a bit of a noob in pyspark and have the following issue. Based on conditions on values coming from two columns I want to populate a third column. The conditions are:
if semicolon contained in col2, check col1
if col1 == 1, take the value before the semicolon
if col1 == 2, take the value after the semicolon
if no semicolon in col2, take the value from col2 as-is
This is what the dataframe looks like:
+----+---------+----+
|col1|col2     |col3|
+----+---------+----+
|1   |24.9;34.9|24.9|
|2   |24.9;34.9|34.9|
|1   |80.8;90.9|80.8|
|2   |80.8;90.9|90.9|
|1   |777      |777 |
+----+---------+----+
I made the following udf, which gives the error "Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions."
import pyspark.sql.functions as F
def split_by_semicolon_if_exists(col1, col2):
    if (col1.contains(';') == True):
        if col2 == 1:
            result = F.substring(col1, 0, (F.length(col1) - F.expr('locate('';'', col1) - 1')))
        if col2 == 2:
            result = F.substring(col1, F.expr('locate('';'', col1) - 1'), (F.length(col1) - F.expr('locate('';'', col1) - 1')))
        return result
    else:
        return col1
df = df.withColumn('col3',
                   split_by_semicolon_if_exists(df['col1'],
                                                df['col2']))
I have built this udf by googling for the various functions so there probably are multiple issues with it. Can you please help me build a udf for this case?
Take a look at the split function.
Using UDF:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

spark = SparkSession.builder.getOrCreate()

data = [
    {"col1": 1, "col2": "24.9;34.9"},
    {"col1": 2, "col2": "24.9;34.9"},
    {"col1": 1, "col2": "80.8;90.9"},
    {"col1": 1, "col2": "777"},
]
df = spark.createDataFrame(data)

def get_value(item, value):
    if ";" in value:
        return value.split(";")[item - 1]
    return value

df = df.withColumn("col3", F.udf(get_value, StringType())(F.col("col1"), F.col("col2")))
Without UDF:
df = df.withColumn(
    "col3",
    F.when(
        F.col("col2").contains(";"), F.split("col2", ";").getItem(F.col("col1") - 1)
    ).otherwise(F.col("col2")),
)
Result:
root
|-- col1: long (nullable = true)
|-- col2: string (nullable = true)
|-- col3: string (nullable = true)
+----+---------+----+
|col1|col2 |col3|
+----+---------+----+
|1 |24.9;34.9|24.9|
|2 |24.9;34.9|34.9|
|1 |80.8;90.9|80.8|
|1 |777 |777 |
+----+---------+----+
You can use this code:
import pyspark.sql.functions as F
import pyspark.sql.types as T
df = spark.createDataFrame(
    data=[(1, "24.9;34.9"),
          (2, "24.9;34.9"),
          (1, "80.8;90.9"),
          (2, "80.8;90.9"),
          (1, "777")],
    schema=["col1", "col2"])
df.show()
def split_by_semicolon_if_exists(col1, col2):
    if ';' in col2:
        if col1 == 1:
            result = col2.split(';')[0]
        if col1 == 2:
            result = col2.split(';')[1]
        return result
    else:
        return col2

split_by_semicolon_if_exists_udf = F.udf(split_by_semicolon_if_exists, T.StringType())
df = df.withColumn('col3', split_by_semicolon_if_exists_udf(df['col1'], df['col2']))
df.show()
To use a function over a column of your dataframe you have to declare it as a UDF with F.udf(function, return_type).
You can check this documentation: https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
Also, Python has simpler ways to work with strings. You can check for a substring with
if 'string' in stringVariable:
(True/False depending on whether the substring is present in the main string)
You can also split a string on a certain character using
string.split(';')
(returns a list of the separated parts)
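For reference, a minimal plain-Python sketch of both operations (values taken from the question):
value = "24.9;34.9"
print(";" in value)         # True - substring check
print(value.split(";"))     # ['24.9', '34.9'] - list of the separated parts
print(value.split(";")[0])  # '24.9'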
You can use expr without having to use a udf here. Since Python indexing starts from 0, you need to subtract 1 from your col1:
from pyspark.sql import functions as F
df.withColumn("Result",F.expr("""split(col2,';')[int(col1)-1]""")).show()
+----+---------+-----+------+
|col1| col2| col3|Result|
+----+---------+-----+------+
| 1|24.9;34.9| 24.9| 24.9|
| 2|24.9;34.9| 34.9| 34.9|
| 1|80.8;90.9| 80.8| 80.8|
| 2|80.8;90.9| 90.9| 90.9|
| 1| 777|777.0| 777|
+----+---------+-----+------+
The new column Result is the same as your desired output in col3.
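If you prefer the value to land directly in col3 (overwriting it) instead of creating a new Result column, the same expression can simply target that column, for example:
df = df.withColumn("col3", F.expr("split(col2,';')[int(col1)-1]"))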
I have a data frame and a function that I want to run on every cell in my data frame:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

def foo(x):
    # does stuff to x
    return x

foo_udf = udf(lambda x: foo(x), StringType())

df = df.withColumn("col1", foo_udf(col("col1"))) \
       .withColumn("col2", foo_udf(col("col2"))) \
       .withColumn("col3", foo_udf(col("col3")))
It simply modifies the data passed in and returns a new value to replace the passed in value.
However, there may be instances where an error occurs; for these instances, I have another column col4 which stores a boolean of whether or not the udf failed for that row.
My issue is that when this occurs, I have no way of accessing col4 for that given row.
You can do this on a partition level with mapPartitions. I will use Fugue, which provides an easier interface to bring this to Spark.
First some setup:
from typing import List, Dict, Any, Iterable
import pandas as pd
def foo(x):
    if x == "E":
        raise ValueError()
    return x + "_ran"
def logic(df: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    for row in df:
        try:
            x = foo(row["col1"])
            y = foo(row["col2"])
            z = foo(row["col3"])
            # if it reaches here, we can update all
            row["col1"] = x
            row["col2"] = y
            row["col3"] = z
            row["col4"] = False
        except:
            row["col4"] = True
    return df
foo() is your original function and logic() is a wrapper to only update the columns if every foo() call is successful. Annotating the function will guide Fugue to apply conversions. From here we can use Fugue's transform() to test on Pandas.
df = pd.DataFrame({"col1": ["A", "B", "C"], "col2": ["A", "B", "C"], "col3": ["D", "E", "F"]})
from fugue import transform
transform(df, logic, schema="*, col4:boolean")
The schema is a requirement for Spark operations; this minimal expression is enough, and Fugue handles the rest. We get the following result:
col1 col2 col3 col4
A_ran A_ran D_ran False
B B E True
C_ran C_ran F_ran False
Because it works on Pandas, we can bring it to Spark. We just need to supply a SparkSession.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(df)
transform(sdf, logic, schema="*, col4:boolean", engine=spark).show()
You can only return/change a single column from a UDF. However, this column can be a StructType containing the payload and an error flag. Then you can "unpack" the struct column into two (or more) normal columns.
from pyspark.sql import functions as F
from pyspark.sql import types as T
# some test data
data = [['A', 4],
        ['B', 2],
        ['C', 5]]
df = spark.createDataFrame(data, ["id", "col1"])

# the udf
def foo(x):
    if x == 5:
        error = True
    else:
        error = False
    return [x, error]

foo_udf = F.udf(lambda x: foo(x), returnType=T.StructType([
    T.StructField("x", T.StringType(), False),
    T.StructField("error", T.BooleanType(), False)
]))

# calling the udf and unpacking the return values
df.withColumn("col1", foo_udf("col1")) \
  .withColumn("error", F.col("col1.error")) \
  .withColumn("col1", F.col("col1.x")) \
  .show()
Output:
+---+----+-----+
| id|col1|error|
+---+----+-----+
| A| 4|false|
| B| 2|false|
| C| 5| true|
+---+----+-----+
I have a pandas DataFrame df that contains a list of filenames.
Here is an example :
print(df)
>>
+---------+---------+
| ID| Field|
+---------+---------+
| AAA.png| X|
| BBB.jpg| Y|
| CCC.png| Z|
+---------+---------+
From a given ID, which is the filename without the extension, I want to retrieve the value of the column Field.
For example, for my_id = BBB, I want to get the value Y.
To do so, I tried the following:
my_id = "BBB"
field_value = df[df["ID"].str.split('.')[0] == my_id]["Field"]
But I get the error KeyError: False. I understand why I have this error but I don't know how I can do that in an other way.
First filter by boolean indexing with DataFrame.loc - output is Series:
field_value = df.loc[df["ID"].str.split('.').str[0] == my_id, "Field"]
And then for first value use next with iter:
first_val = next(iter(field_value), 'no match')
If you need all matched values in a list:
L = field_value.tolist()
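Putting the pieces together on the example frame, a minimal sketch (data taken from the question):
import pandas as pd

df = pd.DataFrame({"ID": ["AAA.png", "BBB.jpg", "CCC.png"],
                   "Field": ["X", "Y", "Z"]})
my_id = "BBB"

# Compare the part before the first dot with my_id
field_value = df.loc[df["ID"].str.split('.').str[0] == my_id, "Field"]

first_val = next(iter(field_value), 'no match')  # 'Y'
L = field_value.tolist()                         # ['Y']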
I tested with str.contains:
my_id="BBB"
field_values = df.loc[df["ID"].str.contains(my_id), "Field"]
print(field_values)
It can return multiple values, as you can see below. It is also bulletproof for file names starting with a dot, like .AAA.png (a short plain-Python check follows the output).
        ID Field
0  AAA.png     X
1  BBB.jpg     Y
2  CCC.png     Z
3  BBB.png     K

1    Y
3    K
Name: Field, dtype: object
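A quick plain-Python check of the leading-dot point (a minimal sketch):
name = ".AAA.png"
print(name.split('.')[0])  # '' - splitting on '.' gives an empty first part
print("AAA" in name)       # True - a substring check still matches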
Using os.path.splitext
Ex:
import os
import pandas as pd
df = pd.DataFrame({"ID": ["AAA.png", "BBB.png", "CCC.png"],
"Field": ["X", "Y", "Z"]})
my_id = "BBB"
mask = df["ID"].apply(os.path.splitext).str[0] == my_id
print(df[mask]["Field"])
Output:
1 Y
Name: Field, dtype: object
I am using pyspark version 1.5.2. I have a pyspark dataframe with a column "id" as shown below:
id
------------
000001_128
000123_1_3
006745_8
000000_9_7
I want to count the number of '_' (underscores) in each row of the DF and perform a when operation such that if there is only one underscore in the string, I want to add '_1' as a suffix; otherwise leave the value as it is. So the desired result would be:
id | new_id
------------------------
000001_128 | 000001_128_1
000123_1_3 | 000123_1_3
006745_8 | 006745_8_1
000000_9_7 | 000000_9_7
I am using pyspark.sql.functions for other operations.
Any help is appreciated!
Here's a non-udf approach:
You can use the same methodology from this answer to count the number of _ in each id, and use pyspark.sql.functions.when() to check if the count is equal to 1. If yes, use pyspark.sql.functions.format_string() to make the new_id, otherwise leave the column unchanged:
import pyspark.sql.functions as f
df.withColumn(
    "new_id",
    f.when(
        (f.size(f.split("id", "_")) - 1) == 1,
        f.format_string("%s_1", f.col("id"))
    ).otherwise(f.col("id"))
).show()
#+----------+------------+
#| id| new_id|
#+----------+------------+
#|000001_128|000001_128_1|
#|000123_1_3| 000123_1_3|
#| 006745_8| 006745_8_1|
#|000000_9_7| 000000_9_7|
#+----------+------------+
from pyspark.sql.functions import udf

@udf(returnType='string')
def fmt(s):
    return s if s.count('_') != 1 else f'{s}_1'

df.withColumn('id', fmt(df.id))
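If you want to keep the original id and add new_id instead, as in the desired output, the same decorated UDF can be used, e.g.:
df = df.withColumn('new_id', fmt(df.id))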
This creates my example dataframe:
from pyspark.sql.functions import lit

df = sc.parallelize([('abc',), ('def',)]).toDF()
df = df.selectExpr("_1 as one")
df = df.withColumn("two", lit('z'))
df.show()
looking like this:
+---+---+
|one|two|
+---+---+
|abc| z|
|def| z|
+---+---+
Now what I want to do is run a series of SQL "where like" checks in which a letter is appended to column two whenever column one matches it.
In "pseudo code" it looks like this:
for letter in ['a','b','c','d']:
    df = df['two'].where(col('one').like("%{}%".format(letter))) += letter
finally resulting in a df looking like this:
+---+----+
|one| two|
+---+----+
|abc|zabc|
|def| zd|
+---+----+
If you are using a list of strings to subset your string column, the best approach is to use broadcast variables. Let's start with a more realistic example where your strings still contain spaces:
from pyspark.sql.functions import lit

df = sc.parallelize([('a b c',), ('d e f',)]).toDF()
df = df.selectExpr("_1 as one")
df = df.withColumn("two", lit('z'))
Then we create a broadcast variable from a list of letters, and define a udf that uses it to subset a list of strings and concatenate the matches with the value in another column, returning one string:
from pyspark.sql.functions import udf

letters = ['a', 'b', 'c', 'd']
letters_bd = sc.broadcast(letters)

def subs(col1, col2):
    l_subset = [x for x in col1 if x in letters_bd.value]
    return col2 + ' ' + ' '.join(l_subset)

subs_udf = udf(subs)
To apply the above, the string we are subsetting needs to be converted to a list, so we use the function split() first and then apply our udf:
from pyspark.sql.functions import col, split
df.withColumn("three", split(col('one'), r'\W+')) \
.withColumn("three", subs_udf("three", "two")) \
.show()
+-----+---+-------+
| one|two| three|
+-----+---+-------+
|a b c| z|z a b c|
|d e f| z| z d|
+-----+---+-------+
Or, without a udf, using regexp_replace and concat, if your letters can comfortably fit into the regex expression:
from pyspark.sql.functions import regexp_replace, col, concat, lit
df.withColumn("three", concat(col('two'), lit(' '),
regexp_replace(col('one'), '[^abcd]', ' ')))
As a simplified example, I have a dataframe "df" with columns "col1,col2" and I want to compute a row-wise maximum after applying a function to each column :
def f(x):
    return (x + 1)

max_udf = udf(lambda x, y: max(x, y), IntegerType())
f_udf = udf(f, IntegerType())

df2 = df.withColumn("result", max_udf(f_udf(df.col1), f_udf(df.col2)))
So if df:

col1  col2
1     2
3     0

then df2:

col1  col2  result
1     2     3
3     0     4
The above doesn't seem to work and produces "Cannot evaluate expression: PythonUDF#f..."
I'm absolutely positive "f_udf" works just fine on my table, and the main issue is with the max_udf.
Without creating extra columns or using basic map/reduce, is there a way to do the above entirely using dataframes and udfs? How should I modify "max_udf"?
I've also tried:
max_udf=udf(max, IntegerType())
which produces the same error.
I've also confirmed that the following works:
df2=(df.withColumn("temp1", f_udf(df.col1))
.withColumn("temp2", f_udf(df.col2))
df2=df2.withColumn("result", max_udf(df2.temp1,df2.temp2))
Why is it that I can't do these in one go?
I would like to see an answer that generalizes to any function "f_udf" and "max_udf."
I had a similar problem and found the solution in the answer to this stackoverflow question
To pass multiple columns or a whole row to a UDF, use a struct:
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import IntegerType
df = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
count_empty_columns = udf(lambda row: len([x for x in row if x == None]), IntegerType())
new_df = df.withColumn("null_count", count_empty_columns(struct([df[x] for x in df.columns])))
new_df.show()
returns:
+----+----+----------+
| a| b|null_count|
+----+----+----------+
|null|null| 2|
| 1|null| 1|
|null| 2| 1|
+----+----+----------+
UserDefinedFunction throws an error when it is given other UDFs as its arguments.
You can modify the max_udf like below to make it work.
df = sc.parallelize([(1, 2), (3, 0)]).toDF(["col1", "col2"])
max_udf = udf(lambda x, y: max(x + 1, y + 1), IntegerType())
df2 = df.withColumn("result", max_udf(df.col1, df.col2))
Or
def f_udf(x):
    return (x + 1)

max_udf = udf(lambda x, y: max(x, y), IntegerType())
## f_udf = udf(f, IntegerType())

df2 = df.withColumn("result", max_udf(f_udf(df.col1), f_udf(df.col2)))
Note:
The second approach is valid if and only if internal functions (here f_udf) generate valid SQL expressions.
It works here because f_udf(df.col1) and f_udf(df.col2) are evaluated as Column<b'(col1 + 1)'> and Column<b'(col2 + 1)'> respectively, before being passed to max_udf. It wouldn't work with an arbitrary function.
It wouldn't work if we tried, for example, something like this:
from math import exp
df.withColumn("result", max_udf(exp(df.col1), exp(df.col2)))
The best way to handle this is to step outside the pyspark.sql.DataFrame representation and use pyspark.RDDs via pyspark.sql.Row.asDict() and pyspark.RDD.map() (https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.map.html#pyspark.RDD.map).
import typing
# Save yourself some pain and always import these things: functions as F and types as T
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Row, SparkSession, SQLContext
spark = (
    SparkSession.builder.appName("Stack Overflow Example")
    .getOrCreate()
)
sc = spark.sparkContext

# sqlContext is needed sometimes to create DataFrames from RDDs
sqlContext = SQLContext(sc)
df = sc.parallelize([Row(**{"a": "hello", "b": 1, "c": 2}), Row(**{"a": "goodbye", "b": 2, "c": 1})]).toDF(["a", "b", "c"])
def to_string(record: dict) -> Row:
    """Create a readable string representation of the record"""
    record["readable"] = f'Word: {record["a"]} A: {record["b"]} B: {record["c"]}'
    return Row(**record)
# Apply the function with a map after converting the Row to a dict
readable_rdd = df.rdd.map(lambda x: x.asDict()).map(to_string)
# Test the function without running the entire DataFrame through it
print(readable_rdd.first())
# This results in: Row(a='hello', b=1, c=2, readable='Word: hello A: 1 B: 2')
# Sometimes you can use `toDF()` to get a dataframe
readable_df = readable_rdd.toDF()
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
# Sometimes you have to use createDataFrame with a specified schema
schema = T.StructType(
    [
        T.StructField("a", T.StringType(), True),
        T.StructField("b", T.IntegerType(), True),
        T.StructField("c", T.StringType(), True),
        T.StructField("readable", T.StringType(), True),
    ]
)
# This is more reliable, you should use it in production!
readable_df = sqlContext.createDataFrame(readable_rdd, schema)
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
Sometimes RDD.map() functions can't use certain Python libraries because the mappers get serialized. In that case you need to partition the data into enough partitions to occupy all the cores of the cluster (a short repartition sketch follows the example below) and then use pyspark.RDD.mapPartitions() to process an entire partition (just an Iterable of dicts) at a time. This enables you to instantiate an expensive object once - like a spaCy Language model - and apply it to the records one at a time without recreating it.
def to_string_partition(partition: typing.Iterable[dict]) -> typing.Iterable[Row]:
    """Add a readable string form to an entire partition"""
    # Instantiate expensive objects here
    # Apply these objects' methods here
    for record in partition:
        record["readable"] = f'Word: {record["a"]} A: {record["b"]} B: {record["c"]}'
        yield Row(**record)
readable_rdd = df.rdd.map(lambda x: x.asDict()).mapPartitions(to_string_partition)
print(readable_rdd.first())
# Row(a='hello', b=1, c=2, readable='Word: hello A: 1 B: 2')
# mapPartitions are more likely to require a specified schema
schema = T.StructType(
    [
        T.StructField("a", T.StringType(), True),
        T.StructField("b", T.IntegerType(), True),
        T.StructField("c", T.StringType(), True),
        T.StructField("readable", T.StringType(), True),
    ]
)
# This is more reliable, you should use it in production!
readable_df = sqlContext.createDataFrame(readable_rdd, schema)
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
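On the partition-count point mentioned before this example, a minimal sketch (the number of partitions is an arbitrary placeholder; tune it to your cluster):
# Repartition so every core gets work, then process each partition once
readable_rdd = (
    df.rdd
    .repartition(64)  # 64 is an arbitrary example value
    .map(lambda x: x.asDict())
    .mapPartitions(to_string_partition)
)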
The DataFrame APIs are good because they make SQL-like operations faster, but sometimes you need the power of direct Python without limitations, and it will greatly benefit your analytics practice to learn to employ RDDs. You can, for example, group records and then evaluate the entire group in RAM, as long as it fits - which you can arrange by altering the partition key and limiting workers/increasing their RAM.
import numpy as np
def median_b(x):
    """Process a group and determine the median value"""
    key = x[0]
    values = x[1]
    # Get the median value
    m = np.median([record["b"] for record in values])
    # Return a Row of the median for each group
    return Row(**{"a": key, "median_b": m})
median_b_rdd = df.rdd.map(lambda x: x.asDict()).groupBy(lambda x: x["a"]).map(median_b)
median_b_rdd.first()
# Row(a='hello', median_b=1.0)
Below is some useful code made specifically to create any new column by simply calling a top-level business rule, completely isolated from the technical and heavy Spark stuff (no need to spend money on, or feel dependent on, Databricks libraries anymore).
My advice: in your organization, try to do things simply and cleanly, for the benefit of top-level data users:
def createColumnFromRule(df, columnName, ruleClass, ruleName, inputColumns=None, inputValues=None, columnType=None):
    from pyspark.sql import functions as F
    from pyspark.sql import types as T

    def _getSparkClassType(shortType):
        defaultSparkClassType = "StringType"
        typesMapping = {
            "bigint"    : "LongType",
            "binary"    : "BinaryType",
            "boolean"   : "BooleanType",
            "byte"      : "ByteType",
            "date"      : "DateType",
            "decimal"   : "DecimalType",
            "double"    : "DoubleType",
            "float"     : "FloatType",
            "int"       : "IntegerType",
            "integer"   : "IntegerType",
            "long"      : "LongType",
            "numeric"   : "NumericType",
            "string"    : defaultSparkClassType,
            "timestamp" : "TimestampType"
        }
        sparkClassType = None
        try:
            sparkClassType = typesMapping[shortType]
        except:
            sparkClassType = defaultSparkClassType
        return sparkClassType

    if (columnType != None): sparkClassType = _getSparkClassType(columnType)
    else: sparkClassType = "StringType"

    aUdf = eval("F.udf(ruleClass." + ruleName + ", T." + sparkClassType + "())")

    columns = None
    values = None
    if (inputColumns != None): columns = F.struct([df[column] for column in inputColumns])
    if (inputValues != None): values = F.struct([F.lit(value) for value in inputValues])

    # Call the rule
    if (inputColumns != None and inputValues != None): df = df.withColumn(columnName, aUdf(columns, values))
    elif (inputColumns != None): df = df.withColumn(columnName, aUdf(columns, F.lit(None)))
    elif (inputValues != None): df = df.withColumn(columnName, aUdf(F.lit(None), values))
    # Create a Null column otherwise
    else:
        if (columnType != None):
            df = df.withColumn(columnName, F.lit(None).cast(columnType))
        else:
            df = df.withColumn(columnName, F.lit(None))

    # Return the resulting dataframe
    return df
Usage example:
# Define your business rule (you can get columns and values)
class CustomerRisk:
    def churnRisk(self, columns=None, values=None):
        isChurnRisk = False
        # ... Rule implementation starts here
        if (values != None):
            if (values[0] == "FORCE_CHURN=true"): isChurnRisk = True
        if (isChurnRisk == False and columns != None):
            if (columns["AGE"] <= 25): isChurnRisk = True
        # ...
        return isChurnRisk
# Execute the rule; it will create your new column in one line of code - that's all, easy isn't it?
# And look how columns and values are passed - it's really easy!
df = createColumnFromRule(df, columnName="CHURN_RISK", ruleClass=CustomerRisk(), ruleName="churnRisk", columnType="boolean", inputColumns=["NAME", "AGE", "ADDRESS"], inputValues=["FORCE_CHURN=true", "CHURN_RISK=100%"])