This question already has answers here:
How to access element of a VectorUDT column in a Spark DataFrame?
(5 answers)
Closed 7 months ago.
Context: I have a DataFrame with two columns, word and vector, where the column type of "vector" is VectorUDT.
An Example:
word | vector
assert | [435,323,324,212...]
And I want to get this:
word | v1 | v2 | v3 | v4 | v5 | v6 ......
assert | 435 | 5435| 698| 356|....
Question:
How can I split a column with vectors in several columns for each dimension using PySpark ?
Thanks in advance
Spark >= 3.0.0
Since Spark 3.0.0 this can be done without using UDF.
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# Convert the VectorUDT column to a plain array column, then select each
# array element as its own column (3 dimensions in this example).
(df
    .withColumn("xs", vector_to_array("vector"))
    .select(["word"] + [col("xs")[i] for i in range(3)]))
## +-------+-----+-----+-----+
## | word|xs[0]|xs[1]|xs[2]|
## +-------+-----+-----+-----+
## | assert| 1.0| 2.0| 3.0|
## |require| 0.0| 2.0| 0.0|
## +-------+-----+-----+-----+
Spark < 3.0.0
One possible approach is to convert to and from RDD:
from pyspark.ml.linalg import Vectors
df = sc.parallelize([
("assert", Vectors.dense([1, 2, 3])),
("require", Vectors.sparse(3, {1: 2}))
]).toDF(["word", "vector"])
def extract(row):
    """Flatten one row into a tuple: the word followed by each vector element.

    ``toArray()`` turns the vector (dense or sparse) into a NumPy array and
    ``tolist()`` converts its values to plain Python numbers.
    """
    return (row.word, ) + tuple(row.vector.toArray().tolist())
df.rdd.map(extract).toDF(["word"]) # Vector values will be named _2, _3, ...
## +-------+---+---+---+
## | word| _2| _3| _4|
## +-------+---+---+---+
## | assert|1.0|2.0|3.0|
## |require|0.0|2.0|0.0|
## +-------+---+---+---+
An alternative solution would be to create an UDF:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType
def to_array(col):
    """Return a column holding the elements of vector column ``col`` as array<double>."""
    def to_array_(v):
        # Densify the vector and convert it to a plain Python list
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)
(df
.withColumn("xs", to_array(col("vector")))
.select(["word"] + [col("xs")[i] for i in range(3)]))
## +-------+-----+-----+-----+
## | word|xs[0]|xs[1]|xs[2]|
## +-------+-----+-----+-----+
## | assert| 1.0| 2.0| 3.0|
## |require| 0.0| 2.0| 0.0|
## +-------+-----+-----+-----+
For Scala equivalent see Spark Scala: How to convert Dataframe[vector] to DataFrame[f1:Double, ..., fn: Double)].
To split the rawPrediction or probability columns generated after training a PySpark ML model into Pandas columns, you can split like this:
your_pandas_df['probability'].apply(lambda x: pd.Series(x.toArray()))
It is much faster to use the i_th udf from how-to-access-element-of-a-vectorudt-column-in-a-spark-dataframe
The extract function given in the solution by zero323 above uses tolist(), which creates a Python list object, populates it with Python float objects, and then finds the desired element by traversing the list; each value then needs to be converted back to a Java double, and this is repeated for every row. Using the RDD is much slower than the to_array udf, which also calls tolist(), but both are much slower than a udf that lets Spark SQL handle most of the work.
Timing code comparing rdd extract and to_array udf proposed here to i_th udf from 3955864:
from pyspark.context import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import lit, udf, col
from pyspark.sql.types import ArrayType, DoubleType
import pyspark.sql.dataframe
from pyspark.sql.functions import pandas_udf, PandasUDFType
sc = SparkContext('local[4]', 'FlatTestTime')
spark = SparkSession(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", True)
from pyspark.ml.linalg import Vectors
# copy the two rows in the test dataframe a bunch of times,
# make this small enough for testing, or go for "big data" and be prepared to wait
REPS = 20000
df = sc.parallelize([
("assert", Vectors.dense([1, 2, 3]), 1, Vectors.dense([4.1, 5.1])),
("require", Vectors.sparse(3, {1: 2}), 2, Vectors.dense([6.2, 7.2])),
] * REPS).toDF(["word", "vector", "more", "vorpal"])
def extract(row):
return (row.word, ) + tuple(row.vector.toArray().tolist(),) + (row.more,) + tuple(row.vorpal.toArray().tolist(),)
def test_extract():
return df.rdd.map(extract).toDF(['word', 'vector__0', 'vector__1', 'vector__2', 'more', 'vorpal__0', 'vorpal__1'])
def to_array(col):
def to_array_(v):
return v.toArray().tolist()
return udf(to_array_, ArrayType(DoubleType()))(col)
def test_to_array():
df_to_array = df.withColumn("xs", to_array(col("vector"))) \
.select(["word"] + [col("xs")[i] for i in range(3)] + ["more", "vorpal"]) \
.withColumn("xx", to_array(col("vorpal"))) \
.select(["word"] + ["xs[{}]".format(i) for i in range(3)] + ["more"] + [col("xx")[i] for i in range(2)])
return df_to_array
# pack up to_array into a tidy function
def flatten(df, vector, vlen):
    """Replace vector column ``vector`` with ``vlen`` scalar columns.

    The expanded columns take the position of the original vector column;
    all other columns keep their order. ``df`` is returned unchanged when it
    has no column named ``vector``.
    """
    field_names = df.schema.fieldNames()
    if vector not in field_names:
        return df

    selected = []
    for name in field_names:
        if name == vector:
            # One output column per vector element
            selected.extend(col(vector)[i] for i in range(vlen))
        else:
            selected.append(col(name))

    # Overwrite the vector column with its array form, then index into it
    return df.withColumn(vector, to_array(col(vector))).select(selected)
def test_flatten():
dflat = flatten(df, "vector", 3)
dflat2 = flatten(dflat, "vorpal", 2)
return dflat2
def ith_(v, i):
    """Return element ``i`` of ``v`` as a float, or None when it is unavailable.

    None is returned when ``v`` is None, when ``i`` is out of range, or when
    the element cannot be converted to a float.
    """
    if v is None:
        # A null vector has no elements; yield a null cell instead of failing
        return None
    try:
        return float(v[i])
    except (IndexError, ValueError):
        # Out-of-range indexing raises IndexError on sequences; ValueError is
        # kept from the original handler (bad float conversion).
        return None
ith = udf(ith_, DoubleType())
select = ["word"]
select.extend([ith("vector", lit(i)) for i in range(3)])
select.append("more")
select.extend([ith("vorpal", lit(i)) for i in range(2)])
# %% timeit ...
def test_ith():
return df.select(select)
if __name__ == '__main__':
import timeit
# make sure these work as intended
test_ith().show(4)
test_flatten().show(4)
test_to_array().show(4)
test_extract().show(4)
print("i_th\t\t",
timeit.timeit("test_ith()",
setup="from __main__ import test_ith",
number=7)
)
print("flatten\t\t",
timeit.timeit("test_flatten()",
setup="from __main__ import test_flatten",
number=7)
)
print("to_array\t",
timeit.timeit("test_to_array()",
setup="from __main__ import test_to_array",
number=7)
)
print("extract\t\t",
timeit.timeit("test_extract()",
setup="from __main__ import test_extract",
number=7)
)
Results:
i_th 0.05964796099999958
flatten 0.4842299350000001
to_array 0.42978780299999997
extract 2.9254476840000017
def splitVecotr(df, new_features=('f1', 'f2')):
    """Append one double column per element of the ``features`` vector column.

    Args:
        df: DataFrame with a vector column named ``features``.
        new_features: names for the new columns; should be the same length
            as the vector column length.

    Returns:
        A new DataFrame with all original columns followed by the new ones.
    """
    import copy

    # StructType.add appends to the object it is called on, so work on a copy
    # to avoid altering the schema object held by ``df``.
    schema = copy.deepcopy(df.schema)
    cols = df.columns
    for name in new_features:
        schema = schema.add(name, DoubleType(), True)

    def widen(row):
        # toArray() first so dense and sparse vectors are handled alike
        return [row[c] for c in cols] + row.features.toArray().tolist()

    return spark.createDataFrame(df.rdd.map(widen), schema)
The function turns the feature vector column into separate columns
Related
I have a PySpark dataframe (say df) which has two columns ( Name and Score). Following is an example of the dataframe:
+------+-----+
| Name|Score|
+------+-----+
| name1|11.23|
| name2|14.57|
| name3| 2.21|
| name4| 8.76|
| name5|18.71|
+------+-----+
I have a numpy array (say bin_array) which has values close to the numerical values that are there in the column titled Score of the PySpark dataframe.
Following is the aforementioned numpy array:
bin_array = np.array([0, 5, 10, 15, 20])
I want to compare value from each row of the column Score with values in bin_array and store the closest value (gotten from bin_array) in a separate column in the PySpark dataframe.
Below is how I would like my new dataframe (say df_new) to look.
+------+-----+------------+
| Name|Score| Closest_bin|
+------+-----+------------+
| name1|11.23| 10.0 |
| name2|14.57| 15.0 |
| name3| 2.21| 0.0 |
| name4| 8.76| 10.0 |
| name5|18.71| 20.0 |
+------+-----+------------+
I have the below mentioned function which gives me the closest values from bin_array. The function works fine when I test it with individual numbers.
def find_nearest(array, value):
    """Return the element of ``array`` closest to ``value``, as a float.

    When two elements are equally close, the one that appears first in
    ``array`` wins (``argmin`` returns the first minimum).
    """
    candidates = np.asarray(array)
    idx = np.abs(candidates - value).argmin()
    return float(candidates[idx])
In my actual work, I will have millions of rows in the dataframe. What is the most efficient way to create df_new?
Following are the steps that I tried to use to create user-defined function (udf) and the new data frame (df_new).
closest_bin_udf = F.udf( lambda x: find_nearest(array, x) )
df_new = df.withColumn( 'Closest_bin' , closest_bin_udf(df.Score) )
But, I got errors when I tried df_new.show(). A portion of the error is shown below.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-685c9b7e25d9> in <module>()
----> 1 df_new.show()
/usr/lib/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
376 """
377 if isinstance(truncate, bool) and truncate:
--> 378 print(self._jdf.showString(n, 20, vertical))
379 else:
380 print(self._jdf.showString(n, int(truncate), vertical))
You can use the below mentioned steps to create the aforementioned dataframe:
from pyspark.sql import *
import pyspark.sql.functions as F
import numpy as np
Stats = Row("Name", "Score")
stat1 = Stats('name1', 11.23)
stat2 = Stats('name2', 14.57)
stat3 = Stats('name3', 2.21)
stat4 = Stats('name4', 8.76)
stat5 = Stats('name5', 18.71)
stat_lst = [stat1 , stat2, stat3, stat4, stat5]
df = spark.createDataFrame(stat_lst)
df.show()
You can use a Bucketizer from pyspark.ml.feature
from pyspark.sql import *
import pyspark.sql.functions as F
import numpy as np
Stats = Row("Name", "Score")
stat_lst = [Stats('name1', 11.23) , Stats('name2', 14.57), Stats('name3', 2.21), Stats('name4', 8.76), Stats('name5', 18.71)]
df = spark.createDataFrame(stat_lst)
from pyspark.ml.feature import Bucketizer
"""
Bucketizer creates bins like 0-5:0, 5-10:1, 10-15:2, 15-20:3
As I see, your expected output wants the closest numbered bin, so you might
have to change your buckets or the variable `t` below accordingly.
"""
bucket_list = [0, 5, 10, 15, 20]
bucketizer = Bucketizer(splits=bucket_list, inputCol="Score", outputCol="buckets")
df_buck = bucketizer.setHandleInvalid("keep").transform(df)
df_buck.show()
I am still working on getting the closest bin, I'll update my answer.
If you want your array values for each bucket you can use udf to create a new column with bucket names
from pyspark.sql.functions import udf
from pyspark.sql.types import *
t = dict(zip(range(len(bucket_list)), bucket_list))
udf_foo = udf(lambda x: t[x], IntegerType())
df_buck = df_buck.withColumn("score_bucket", udf_foo("buckets"))
Output
>>> df_buck.show()
+-----+-----+-------+------------+
| Name|Score|buckets|score_bucket|
+-----+-----+-------+------------+
|name1|11.23| 2.0| 10|
|name2|14.57| 2.0| 10|
|name3| 2.21| 0.0| 0|
|name4| 8.76| 1.0| 5|
|name5|18.71| 3.0| 15|
+-----+-----+-------+------------+
EDIT: Correcting the score buckets:
# Not dynamic, but please try to figure out this business logic according to your use-case
df_buck = df_buck.withColumn("correct_buckets", F.when(df_buck.Score-df_buck.score_bucket > 5/2, F.col("score_bucket") + 5).otherwise(F.col("score_bucket"))).drop("buckets", "score_bucket")
Now output is as expected:
+-----+-----+---------------+
| Name|Score|correct_buckets|
+-----+-----+---------------+
|name1|11.23| 10|
|name2|14.57| 15|
|name3| 2.21| 0|
|name4| 8.76| 10|
|name5|18.71| 20|
+-----+-----+---------------+
You can also use pandas_udf, although I'd suggest you test out the speed and memory consumption as you scale up
from pyspark.sql.functions import pandas_udf, PandasUDFType
import numpy as np
import pandas as pd
df = spark.createDataFrame(zip(["name_"+str(i) for i in range(1,6)], [11.23, 14.57, 2.21, 8.76, 18.71]), ["Name", "Score"])
bin_array = np.array([0, 5, 10, 15, 20])
@pandas_udf('double', PandasUDFType.SCALAR)
def find_nearest(value):
    """Map each score in the pandas Series ``value`` to its closest entry in ``bin_array``."""
    # Rows are scores, columns are bins: res[r, b] = bin_array[b] - value[r]
    res = bin_array[np.newaxis, :] - value.values[:, np.newaxis]
    # argmin returns the first minimum, so ties go to the earlier bin_array entry
    nearest = bin_array[np.abs(res).argmin(axis=1)]
    # The UDF is declared as returning 'double', so hand back floats explicitly
    return pd.Series(nearest, dtype="float64")
df.withColumn('v2', find_nearest(df.Score)).show()
Output
+------+-----+----+
| Name|Score| v2|
+------+-----+----+
|name_1|11.23|10.0|
|name_2|14.57|15.0|
|name_3| 2.21| 0.0|
|name_4| 8.76|10.0|
|name_5|18.71|20.0|
+------+-----+----+
I have created a dataframe as shown
import ast
from pyspark.sql.functions import udf
values = [(u'['2','4','713',10),(u'['12','245']',20),(u'['101','12']',30)]
df = sqlContext.createDataFrame(values,['list','A'])
df.show()
+-----------------+---+
| list| A|
+-----------------+---+
|u'['2','4','713']| 10|
| u' ['12','245']| 20|
| u'['101','12',]| 30|
+-----------------+---+
**How can I convert the above dataframe such that each element in the list is a float and is within a proper list**
I tried the below one :
def df_amp_conversion(df_modelamp):
string_list_to_list = udf(lambda row: ast.literal_eval(str(row)))
df_modelamp = df_modelamp.withColumn('float_list',string_list_to_list(col("list")))
df2 = amp_conversion(df)
But the data remains the same without a change.
I don't want to convert the dataframe to pandas or use collect, as that is memory intensive.
If possible, please suggest an optimal solution. I am using PySpark.
That's because you forgot about the type
udf(lambda row: ast.literal_eval(str(row)), "array<integer>")
Though something like this would be more efficient:
from pyspark.sql.functions import ltrim, regexp_replace, rtrim, split

df = spark.createDataFrame(["""u'[23,4,77,890,4]"""], "string").toDF("list")

# Strip the leading "u'[" and the trailing "]", split on commas, then cast
# the resulting array of strings to array<integer>.
df.select(split(
    regexp_replace("list", "^u'\\[|\\]$", ""), ","
).cast("array<integer>").alias("list")).show()
# +-------------------+
# | list|
# +-------------------+
# |[23, 4, 77, 890, 4]|
# +-------------------+
I can create the true result in python 3 with a little change in definition of function df_amp_conversion. You didn't return the value of df_modelamp! This code works for me properly:
import ast
from pyspark.sql.functions import udf, col
values = [(u"['2','4','713']",10),(u"['12','245']",20),(u"['101','12']",30)]
df = sqlContext.createDataFrame(values,['list','A'])
def df_amp_conversion(df_modelamp):
string_list_to_list = udf(lambda row: ast.literal_eval(str(row)))
df_modelamp = df_modelamp.withColumn('float_list',string_list_to_list(col("list")))
return df_modelamp
df2 = df_amp_conversion(df)
df2.show()
# +---------------+---+-----------+
# | list| A| float_list|
# +---------------+---+-----------+
# |['2','4','713']| 10|[2, 4, 713]|
# | ['12','245']| 20| [12, 245]|
# | ['101','12']| 30| [101, 12]|
# +---------------+---+-----------+
I want to know how to map values in a specific column in a dataframe.
I have a dataframe which looks like:
df = sc.parallelize([('india','japan'),('usa','uruguay')]).toDF(['col1','col2'])
+-----+-------+
| col1| col2|
+-----+-------+
|india| japan|
| usa|uruguay|
+-----+-------+
I have a dictionary from where I want to map the values.
dicts = sc.parallelize([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')])
The output I want is:
+-----+-------+--------+--------+
| col1| col2|col1_map|col2_map|
+-----+-------+--------+--------+
|india| japan| ind| jpn|
| usa|uruguay| us| urg|
+-----+-------+--------+--------+
I have tried using the lookup function but it doesn't work. It throws error SPARK-5063. Following is my approach which failed:
def map_val(x):
return dicts.lookup(x)[0]
myfun = udf(lambda x: map_val(x), StringType())
df = df.withColumn('col1_map', myfun('col1')) # doesn't work
df = df.withColumn('col2_map', myfun('col2')) # doesn't work
I think the easier way is just to use a simple dictionary and df.withColumn.
from itertools import chain
from pyspark.sql.functions import create_map, lit
simple_dict = {'india':'ind', 'usa':'us', 'japan':'jpn', 'uruguay':'urg'}
mapping_expr = create_map([lit(x) for x in chain(*simple_dict.items())])
df = df.withColumn('col1_map', mapping_expr[df['col1']])\
.withColumn('col2_map', mapping_expr[df['col2']])
df.show(truncate=False)
udf way
I would suggest you to change the list of tuples to dicts and broadcast it to be used in udf
dicts = sc.broadcast(dict([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')]))
from pyspark.sql import functions as f
from pyspark.sql import types as t
def newCols(x):
return dicts.value[x]
callnewColsUdf = f.udf(newCols, t.StringType())
df.withColumn('col1_map', callnewColsUdf(f.col('col1')))\
.withColumn('col2_map', callnewColsUdf(f.col('col2')))\
.show(truncate=False)
which should give you
+-----+-------+--------+--------+
|col1 |col2 |col1_map|col2_map|
+-----+-------+--------+--------+
|india|japan |ind |jpn |
|usa |uruguay|us |urg |
+-----+-------+--------+--------+
join way (slower than udf way)
All you have to do is change the dicts rdd to dataframe too and use two joins with aliasings as following
df = sc.parallelize([('india','japan'),('usa','uruguay')]).toDF(['col1','col2'])
dicts = sc.parallelize([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')]).toDF(['key', 'value'])
from pyspark.sql import functions as f
df.join(dicts, df['col1'] == dicts['key'], 'inner')\
.select(f.col('col1'), f.col('col2'), f.col('value').alias('col1_map'))\
.join(dicts, df['col2'] == dicts['key'], 'inner') \
.select(f.col('col1'), f.col('col2'), f.col('col1_map'), f.col('value').alias('col2_map'))\
.show(truncate=False)
which should give you the same result
Similar to Ali AzG, but pulling it all out into a handy little method if anyone finds it useful
from itertools import chain
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from typing import Dict
def map_column_values(df:DataFrame, map_dict:Dict, column:str, new_column:str="")->DataFrame:
    """Handy method for mapping column values from one value to another

    Args:
        df (DataFrame): Dataframe to operate on
        map_dict (Dict): Dictionary containing the values to map from and to
        column (str): The column containing the values to be mapped
        new_column (str, optional): The name of the column to store the mapped values in.
                                    If not specified the values will be stored in the original column

    Returns:
        DataFrame
    """
    # chain(*items) flattens the dict into key1, value1, key2, value2, ...,
    # which is the argument layout passed to create_map here.
    spark_map = F.create_map([F.lit(x) for x in chain(*map_dict.items())])
    # An empty new_column falls back to overwriting ``column``
    return df.withColumn(new_column or column, spark_map[df[column]])
This can be used as follows
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.master("local[3]").getOrCreate()
df = spark.createDataFrame([Row(A=0), Row(A=1)])
df = map_column_values(df, map_dict={0:"foo", 1:"bar"}, column="A", new_column="B")
df.show()
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#+---+---+
#| A| B|
#+---+---+
#| 0|foo|
#| 1|bar|
#+---+---+
I am currently working on PySpark with Databricks and I was looking for a way to truncate a string just like the excel right function does.
For example, I would like to change for an ID column in a DataFrame 8841673_3 into 8841673.
Does anybody know how I should proceed?
Regular expressions with regexp_extract:
from pyspark.sql.functions import regexp_extract

df = spark.createDataFrame([("8841673_3", )], ("id", ))

# Raw string: "\d" in a plain string literal is an invalid escape sequence.
# The pattern text itself is unchanged.
df.select(regexp_extract("id", r"^(\d+)_.*", 1)).show()
# +--------------------------------+
# |regexp_extract(id, ^(\d+)_.*, 1)|
# +--------------------------------+
# | 8841673|
# +--------------------------------+
regexp_replace:
from pyspark.sql.functions import regexp_replace
df.select(regexp_replace("id", "_.*$", "")).show()
# +--------------------------+
# |regexp_replace(id, _.*$, )|
# +--------------------------+
# | 8841673|
# +--------------------------+
or just split:
from pyspark.sql.functions import split
df.select(split("id", "_")[0]).show()
# +---------------+
# |split(id, _)[0]|
# +---------------+
# | 8841673|
# +---------------+
You can use the pyspark.sql.Column.substr method:
import pyspark.sql.functions as F
def left(x, n):
    """Return the first ``n`` characters of string column ``x`` (like Excel's LEFT)."""
    # substr positions are 1-based, so the first character is at position 1
    return x.substr(1, n)
def right(x, n):
    """Return the last ``n`` characters of string column ``x`` (like Excel's RIGHT).

    When the string is shorter than ``n`` the whole string is returned.
    """
    x_len = F.length(x)
    # substr positions are 1-based, so the last n characters start at
    # len - n + 1 (starting at len - n would return n + 1 characters).
    # Clamp to 1 so short strings do not produce a zero/negative start.
    start = F.greatest(x_len - n + 1, F.lit(1))
    # substr needs both arguments to be of the same kind; start is a Column,
    # so the length is wrapped with F.lit as well.
    return x.substr(start, F.lit(n))
As a simplified example, I have a dataframe "df" with columns "col1,col2" and I want to compute a row-wise maximum after applying a function to each column :
def f(x):
return (x+1)
max_udf=udf(lambda x,y: max(x,y), IntegerType())
f_udf=udf(f, IntegerType())
df2=df.withColumn("result", max_udf(f_udf(df.col1),f_udf(df.col2)))
So if df:
col1 col2
1 2
3 0
Then
df2:
col1 col2 result
1 2 3
3 0 4
The above doesn't seem to work and produces "Cannot evaluate expression: PythonUDF#f..."
I'm absolutely positive "f_udf" works just fine on my table, and the main issue is with the max_udf.
Without creating extra columns or using basic map/reduce, is there a way to do the above entirely using dataframes and udfs? How should I modify "max_udf"?
I've also tried:
max_udf=udf(max, IntegerType())
which produces the same error.
I've also confirmed that the following works:
# Materialise each inner UDF result as its own column first ...
df2 = (df.withColumn("temp1", f_udf(df.col1))
       .withColumn("temp2", f_udf(df.col2)))
# ... then feed those plain columns to the outer UDF
df2 = df2.withColumn("result", max_udf(df2.temp1, df2.temp2))
Why is it that I can't do these in one go?
I would like to see an answer that generalizes to any function "f_udf" and "max_udf."
I had a similar problem and found the solution in the answer to this stackoverflow question
To pass multiple columns or a whole row to an UDF use a struct:
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import IntegerType
df = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
count_empty_columns = udf(lambda row: len([x for x in row if x == None]), IntegerType())
new_df = df.withColumn("null_count", count_empty_columns(struct([df[x] for x in df.columns])))
new_df.show()
returns:
+----+----+----------+
| a| b|null_count|
+----+----+----------+
|null|null| 2|
| 1|null| 1|
|null| 2| 1|
+----+----+----------+
UserDefinedFunction is throwing error while accepting UDFs as their arguments.
You can modify the max_udf like below to make it work.
df = sc.parallelize([(1, 2), (3, 0)]).toDF(["col1", "col2"])
max_udf = udf(lambda x, y: max(x + 1, y + 1), IntegerType())
df2 = df.withColumn("result", max_udf(df.col1, df.col2))
Or
def f_udf(x):
return (x + 1)
max_udf = udf(lambda x, y: max(x, y), IntegerType())
## f_udf=udf(f, IntegerType())
df2 = df.withColumn("result", max_udf(f_udf(df.col1), f_udf(df.col2)))
Note:
The second approach is valid if and only if internal functions (here f_udf) generate valid SQL expressions.
It works here because f_udf(df.col1) and f_udf(df.col2) are evaluated as Column<b'(col1 + 1)'> and Column<b'(col2 + 1)'> respectively, before being passed to max_udf. It wouldn't work with arbitrary function.
It wouldn't work if we try for example something like this:
from math import exp
df.withColumn("result", max_udf(exp(df.col1), exp(df.col2)))
The best way to handle this is to escape the pyspark.sql.DataFrame representation and use pyspark.RDDs via pyspark.sql.Row.asDict() and [pyspark.RDD.map()](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.map.html#pyspark.RDD.map).
import typing
# Save yourself some pain and always import these things: functions as F and types as T
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Row, SparkSession, SQLContext
spark = (
SparkSession.builder.appName("Stack Overflow Example")
.getOrCreate()
)
sc = spark.sparkContext
# sqlContet is needed sometimes to create DataFrames from RDDs
sqlContext = SQLContext(sc)
df = sc.parallelize([Row(**{"a": "hello", "b": 1, "c": 2}), Row(**{"a": "goodbye", "b": 2, "c": 1})]).toDF(["a", "b", "c"])
def to_string(record:dict) -> Row:
"""Create a readable string representation of the record"""
record["readable"] = f'Word: {record["a"]} A: {record["b"]} B: {record["c"]}'
return Row(**record)
# Apply the function with a map after converting the Row to a dict
readable_rdd = df.rdd.map(lambda x: x.asDict()).map(to_string)
# Test the function without running the entire DataFrame through it
print(readable_rdd.first())
# This results in: Row(a='hello', b=1, c=2, readable='Word: hello A: 1 B: 2')
# Sometimes you can use `toDF()` to get a dataframe
readable_df = readable_rdd.toDF()
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
# Sometimes you have to use createDataFrame with a specified schema
schema = T.StructType(
[
T.StructField("a", T.StringType(), True),
T.StructField("b", T.IntegerType(), True),
T.StructField("c", T.StringType(), True),
T.StructField("readable", T.StringType(), True),
]
)
# This is more reliable, you should use it in production!
readable_df = sqlContext.createDataFrame(readable_rdd, schema)
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
Sometimes RDD.map() functions can't use certain Python libraries because the mapper functions get serialized. In that case, partition the data into enough partitions to occupy all the cores of the cluster and then use pyspark.RDD.mapPartitions() to process an entire partition (just an Iterable of dicts) at a time. This enables you to instantiate an expensive object once - like a spaCy Language model - and apply it to one record at a time without recreating it.
def to_string_partition(partition:typing.Iterable[dict]) -> typing.Iterable[Row]:
"""Add a readable string form to an entire partition"""
# Instantiate expensive objects here
# Apply these objects' methods here
for record in partition:
record["readable"] = f'Word: {record["a"]} A: {record["b"]} B: {record["c"]}'
yield Row(**record)
readable_rdd = df.rdd.map(lambda x: x.asDict()).mapPartitions(to_string_partition)
print(readable_rdd.first())
# Row(a='hello', b=1, c=2, readable='Word: hello A: 1 B: 2')
# mapPartitions are more likely to require a specified schema
schema = T.StructType(
[
T.StructField("a", T.StringType(), True),
T.StructField("b", T.IntegerType(), True),
T.StructField("c", T.StringType(), True),
T.StructField("readable", T.StringType(), True),
]
)
# This is more reliable, you should use it in production!
readable_df = sqlContext.createDataFrame(readable_rdd, schema)
readable_df.show()
# +-------+---+---+--------------------+
# | a| b| c| readable|
# +-------+---+---+--------------------+
# | hello| 1| 2|Word: hello A: 1 ...|
# |goodbye| 2| 1|Word: goodbye A: ...|
# +-------+---+---+--------------------+
The DataFrame APIs are good because they allow SQL-like operations to be faster, but sometimes you need the power of direct Python without any limitations and it will greatly benefit your analytics practice to learn to employ RDDs. You can group records for example and then evaluate the entire group in RAM, just so long as it fits - which you can arrange by altering the partition key and limiting workers/increasing their RAM.
import numpy as np
def median_b(x):
"""Process a group and determine the median value"""
key = x[0]
values = x[1]
# Get the median value
m = np.median([record["b"] for record in values])
# Return a Row of the median for each group
return Row(**{"a": key, "median_b": m})
median_b_rdd = df.rdd.map(lambda x: x.asDict()).groupBy(lambda x: x["a"]).map(median_b)
median_b_rdd.first()
# Row(a='hello', median_b=1.0)
Below is a useful piece of code for creating any new column by simply calling a top-level business rule, completely isolated from the heavy technical Spark machinery (no need to spend $ or to feel dependent on Databricks libraries anymore).
My advice is: in your organization, try to do things simply and cleanly, for the benefit of top-level data users:
def createColumnFromRule(df, columnName, ruleClass, ruleName, inputColumns=None, inputValues=None, columnType=None):
    """Add column ``columnName`` to ``df`` by applying a business rule as a UDF.

    Args:
        df: DataFrame to extend.
        columnName: name of the column to create.
        ruleClass: object exposing the rule; ``ruleName`` is looked up on it.
        ruleName: attribute name of the rule callable on ``ruleClass``.
        inputColumns: optional list of column names, passed to the rule as a
            single struct (first argument).
        inputValues: optional list of literal values, passed to the rule as a
            single struct (second argument).
        columnType: optional short type name (e.g. "boolean", "int"); unknown
            names, or no name at all, fall back to string.

    Returns:
        The DataFrame with the new column. When neither ``inputColumns`` nor
        ``inputValues`` is given the rule is not called and the column is
        filled with nulls (cast to ``columnType`` when one is provided).
    """
    from pyspark.sql import functions as F
    from pyspark.sql import types as T

    def _getSparkClassType(shortType):
        """Translate a short type name into the name of a pyspark.sql.types class."""
        defaultSparkClassType = "StringType"
        typesMapping = {
            "bigint"    : "LongType",
            "binary"    : "BinaryType",
            "boolean"   : "BooleanType",
            "byte"      : "ByteType",
            "date"      : "DateType",
            "decimal"   : "DecimalType",
            "double"    : "DoubleType",
            "float"     : "FloatType",
            "int"       : "IntegerType",
            "integer"   : "IntegerType",
            "long"      : "LongType",
            "numeric"   : "NumericType",
            "string"    : defaultSparkClassType,
            "timestamp" : "TimestampType"
        }
        try:
            return typesMapping[shortType]
        except (KeyError, TypeError):
            # Unknown (or unhashable) short names fall back to the default
            return defaultSparkClassType

    if columnType is not None:
        sparkClassType = _getSparkClassType(columnType)
    else:
        sparkClassType = "StringType"

    # Resolve the rule and the return type by attribute lookup instead of
    # eval() on a string assembled from ruleName.
    aUdf = F.udf(getattr(ruleClass, ruleName), getattr(T, sparkClassType)())

    columns = None
    values = None
    if inputColumns is not None:
        columns = F.struct([df[column] for column in inputColumns])
    if inputValues is not None:
        values = F.struct([F.lit(value) for value in inputValues])

    # Call the rule
    if inputColumns is not None and inputValues is not None:
        df = df.withColumn(columnName, aUdf(columns, values))
    elif inputColumns is not None:
        df = df.withColumn(columnName, aUdf(columns, F.lit(None)))
    elif inputValues is not None:
        df = df.withColumn(columnName, aUdf(F.lit(None), values))
    # Create a Null column otherwise
    else:
        if columnType is not None:
            df = df.withColumn(columnName, F.lit(None).cast(columnType))
        else:
            df = df.withColumn(columnName, F.lit(None))

    # Return the resulting dataframe
    return df
Usage example:
# Define your business rule (you can get columns and values)
class CustomerRisk:
    """Example holder for business rules; each rule is a plain method."""

    def churnRisk(self, columns=None, values=None):
        """Return True when the customer is considered a churn risk.

        Args:
            columns: optional mapping-like row of input columns; ``"AGE"`` is read.
            values: optional sequence of literal values; the first one forces
                the result when it equals ``"FORCE_CHURN=true"``.
        """
        isChurnRisk = False
        # ... Rule implementation starts here
        if values is not None:
            if values[0] == "FORCE_CHURN=true":
                isChurnRisk = True
        # Only consult the columns when the flag was not already forced
        if not isChurnRisk and columns is not None:
            if columns["AGE"] <= 25:
                isChurnRisk = True
        # ...
        return isChurnRisk
# Execute the rule, it will create your new column in one line of code, that's all, easy isn't ?
# And look how to pass columns and values, it's really easy !
df = createColumnFromRule(df, columnName="CHURN_RISK", ruleClass=CustomerRisk(), ruleName="churnRisk", columnType="boolean", inputColumns=["NAME", "AGE", "ADDRESS"], inputValues=["FORCE_CHURN=true", "CHURN_RISK=100%"])