Python (pyspark) - Function decorator for UDF definition

I'm trying to define a UDF with an accumulator inside it. The accumulator is used to save exceptions raised by my_function for later inspection. I came up with a definition of the UDF that takes some arguments (returnType, accumulator).
I'd like to make it a bit more readable and reusable. How could I define a decorator function for the code below?
from pyspark.sql import functions as F
from pyspark.accumulators import AccumulatorParam
from pyspark.sql.types import StringType, StructField, IntegerType, StructType
from pyspark.sql import Row

data = [
    Row(word="foo", number=7),
    Row(word="bar", number=13)]

schema = StructType([
    StructField("word", StringType(), True),
    StructField("number", IntegerType(), True)])

df = spark.createDataFrame(data, schema)
Creation of my custom accumulator
class ListParam(AccumulatorParam):
    def zero(self, v):
        return []

    def addInPlace(self, variable, value):
        variable.append(value)
        return variable

accum = spark.sparkContext.accumulator([], ListParam())
Definition of my udf
def accumulator_udf(accumulator, returnType):
    def my_function(x):
        y = None
        try:
            y = (x / (x - 7))
        except Exception as e:
            accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
        return y
    return F.udf(my_function, returnType)

my_udf = accumulator_udf(accumulator=accum, returnType=IntegerType())
Result
df.select(my_udf(df.number)).show()
+---------------+
|div_one(number)|
+---------------+
| null|
| 2|
+---------------+
print(accum.value)
> [[{'errorType': 'integer division or modulo by zero', 'Data': 7}], []]
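(Side note, my own observation rather than part of the original post: the nested lists in accum.value are expected with this ListParam, because Spark also calls addInPlace to merge each task's accumulated list into the driver's value, so a whole list gets appended as a single element. A minimal sketch of a variant that keeps the result flat, under that assumption:)
class FlatListParam(AccumulatorParam):
    def zero(self, v):
        return []

    def addInPlace(self, acc, value):
        # addInPlace is used both for single updates (a dict here) and for
        # merging one partition's list into another, hence the type check
        if isinstance(value, list):
            acc.extend(value)
        else:
            acc.append(value)
        return acc

accum = spark.sparkContext.accumulator([], FlatListParam())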
What I've tried
After a few readings, I found this post (https://www.thecodeship.com/patterns/guide-to-python-function-decorators/) which helped, but I'm stuck with a NameError:
def accumulator_udf(accumulator, returnType):
    def func_wrapper(func):
        return F.udf(func, returnType)
    return func_wrapper

accum = spark.sparkContext.accumulator([], ListParam())

@accumulator_udf(accumulator=accum, returnType=IntegerType())
def my_function(x):
    y = None
    try:
        y = (x / (x - 7))
    except Exception as e:
        accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
    return y

df.select(my_function(df.number)).show()
When trying this implementation, I get the following error:
NameError: global name 'accumulator' is not defined
How can I manage to access 'accumulator' from inside my_function?
Thanks!
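(For reference, a minimal sketch of one possible answer, my own assumption rather than an accepted solution: move the try/except into the wrapper so the accumulator is captured from the decorator's enclosing scope and no global name is needed.)
def accumulator_udf(accumulator, returnType):
    def func_wrapper(func):
        def wrapped(x):
            try:
                return func(x)
            except Exception as e:
                # the accumulator is captured from the enclosing scope
                accumulator.add({'errorType': str(e), 'Data': x})
                return None
        return F.udf(wrapped, returnType)
    return func_wrapper

accum = spark.sparkContext.accumulator([], ListParam())

@accumulator_udf(accumulator=accum, returnType=IntegerType())
def my_function(x):
    return x // (x - 7)   # integer division so the result matches IntegerType

df.select(my_function(df.number)).show()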

Related

Pyspark use sql.transform to nullify all empty strings in a column containing an array of structs

I have a column in a pyspark df that contains an array of maps like the below:
[{"address": "Fadden", "city": "", "country": "", "note": "", "stateProvince": "Queensland"}]
df.printSchema() returns the following for the column:
|-- constituencies: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- address: string (nullable = true)
| | |-- city: string (nullable = true)
| | |-- country: string (nullable = true)
| | |-- note: string (nullable = true)
| | |-- stateProvince: string (nullable = true)
And I want to nullify all those empty strings. So I thought this would be a perfect problem to solve with F.transform(col, f)
So I created the function, and then I use it in the transform expression like below:
def nullify_vals(d):
    def nullify_string(str_):
        if str_.strip() == "":
            return None
        return str_.strip()
    return (
        dict((k, nullify_string(v)) for k, v in d.items())
    )
Note that the above works when tested on a dictionary:
dd = {"my": "map", "is": "", "not": " ", "entierly": " empty , right?"}
d_cln = nullify_vals(dd)
d_cln["not"] is None # returns True
But when I then use it in Pyspark, it gives me an error:
import pyspark.sql.functions as F
result = kyclean.select(F.transform("constituencies", nullify_vals))
TypeError: 'Column' object is not callable
These are the last lines of the stacktrace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File <command-899394298900126>:1, in <module>
----> 1 result = kyclean.select(F.transform("constituencies", nullify_vals))
File /databricks/spark/python/pyspark/sql/functions.py:4260, in transform(col, f)
4214 def transform(col, f):
4215 """
4216 Returns an array of elements after applying a transformation to each element in the input array.
4217
(...)
4258 +--------------+
4259 """
-> 4260 return _invoke_higher_order_function("ArrayTransform", [col], [f])
File /databricks/spark/python/pyspark/sql/functions.py:4209, in _invoke_higher_order_function(name, cols, funs)
4206 expr = getattr(expressions, name)
4208 jcols = [_to_java_column(col).expr() for col in cols]
-> 4209 jfuns = [_create_lambda(f) for f in funs]
4211 return Column(sc._jvm.Column(expr(*jcols + jfuns)))
Your function nullify_vals should take a Column object of type StructType, because your array elements are structs, but you're passing in plain Python objects.
Try rewriting it like this instead:
from typing import List
from pyspark.sql import functions as F, Column

def nullify_vals(struct_col: Column, fields: List[str]) -> Column:
    for f in fields:
        struct_col = struct_col.withField(
            f,
            F.when(F.trim(struct_col[f]) == "", None).otherwise(struct_col[f])
        )
    return struct_col
For each field in the inner struct, we use the Column withField method to update it: if the field equals the empty string, we set it to null.
Applied to your input example:
json_str = '{"constituencies":[{"address":"Fadden","city":"","country":"","note":"","stateProvince":"Queensland"}]}'
df = spark.read.json(spark.sparkContext.parallelize([json_str]))
You can get the list of constituencies struct fields from dataframe schema:
constituencies_fields = df.selectExpr("inline(constituencies)").columns
df1 = df.withColumn(
    "constituencies",
    F.transform("constituencies", lambda x: nullify_vals(x, constituencies_fields))
)
df1.show(truncate=False)
#+----------------------------------------+
#|constituencies |
#+----------------------------------------+
#|[{Fadden, null, null, null, Queensland}]|
#+----------------------------------------+
I'm still looking into the error you got, and I'll update the post when I figure out what's wrong. In the meantime, you can do something like this to work around it:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

schema = ArrayType(
    StructType([
        StructField('address', StringType()),
        StructField('city', StringType()),
        StructField('country', StringType()),
        StructField('note', StringType()),
        StructField('stateProvince', StringType()),
    ]), True)

nullify_udf = udf(lambda arr: [[(v if v.strip() != "" else None) for v in area] for area in arr], schema)

result = kyclean.withColumn('constituencies', nullify_udf('constituencies'))
The specific error you got says that you can't call d.items() as a function: the function you pass in really needs to work on the Column object d that it receives.
The description of pyspark.sql.functions.transform says, "Returns an array of elements after applying a transformation to each element in the input array."
But inside the description of the accepted function, f, it says, "...and can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052)." So it can't take in custom Python UserDefinedFunctions yet, which is sort of what you were trying to do.
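As a small illustration (my own sketch, not from the original answer), the function handed to F.transform receives a Column for each array element and must return a Column expression built from Column methods or pyspark.sql.functions, for example:
# x is a Column representing one struct element of the array; only Column
# operations are allowed here, not plain Python logic like x["address"].strip()
df1.withColumn(
    "address_upper",
    F.transform("constituencies", lambda x: F.upper(x["address"]))
).show(truncate=False)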

How to add multiple columns to pyspark DF using pandas_udf with multiple source columns?

I need to extract from utc_timestamp its date and its hour into two different columns, depending on the time zone. The time zone name is defined by id from a configuration constant.
Input DF:
+-------------+--+
|utc_timestamp|id|
+-------------+--+
|1608000000782|1 |
|1608000240782|2 |
+-------------+--+

Output DF:
+-------------+--+----------+----+
|utc_timestamp|id|date      |hour|
+-------------+--+----------+----+
|1608000000782|1 |2020-12-14|20  |
|1608000240782|2 |2020-12-15|11  |
+-------------+--+----------+----+
I have a pandas_udf that allows me to extract one column at a time, and I have to create it twice:
from pyspark.sql import DataFrame
from pyspark.sql import functions as f
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import DateType, IntegerType
import pandas as pd
import pytz

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

class TimezoneUdfProvider(object):
    def __init__(self):
        self.extract_date_udf = pandas_udf(self._extract_date, DateType(), PandasUDFType.SCALAR)
        self.extract_hour_udf = pandas_udf(self._extract_hour, IntegerType(), PandasUDFType.SCALAR)

    def _extract_date(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_date(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])

    def _extract_hour(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_hour(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])

def extract_date(utc_timestamp: int, id: str):
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).date()

def extract_hour(utc_timestamp: int, id: str) -> int:
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).hour

def extract_from_utc(df: DataFrame) -> DataFrame:
    timezone_udf1 = TimezoneUdfProvider()
    df_with_date = df.withColumn('date', timezone_udf1.extract_date_udf(f.col('utc_timestamp'), f.col('id')))
    timezone_udf2 = TimezoneUdfProvider()
    df_with_hour = df_with_date.withColumn('hour', timezone_udf2.extract_hour_udf(f.col('utc_timestamp'), f.col('id')))
    return df_with_hour
Is there a better way to do it? Without a need to use the same udf provider twice?
You can do this without a UDF, using Spark's built-in functions.
We can use create_map to map the dictionary and create a new TimeZone column, then convert with from_unixtime and from_utc_timestamp using the newly mapped column as the time zone. Once we have the timestamp in the right time zone, we can fetch the hour and date fields.
TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

import pyspark.sql.functions as F
from itertools import chain

map_exp = F.create_map([F.lit(i) for i in chain(*TIMEZONE_LIST.items())])

final = (df.withColumn("TimeZone", map_exp.getItem(F.col("id")))
           .withColumn("Timestamp",
                       F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp")/1000), F.col("TimeZone")))
           .withColumn("date", F.to_date("Timestamp"))
           .withColumn("Hour", F.hour("Timestamp"))
           .drop("Timestamp"))
final.show()
+-------------+---+---------------+----------+----+
|utc_timestamp| id| TimeZone| date|Hour|
+-------------+---+---------------+----------+----+
|1608000000782| 1|America/Chicago|2020-12-14| 20|
|1608000240782| 2| Asia/Tokyo|2020-12-15| 11|
+-------------+---+---------------+----------+----+
EDIT: replacing create_map with a udf:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

def fun(x):
    return TIMEZONE_LIST.get(x, None)

map_udf = F.udf(fun, StringType())

final = (df.withColumn("TimeZone", map_udf("id"))
           .withColumn("Timestamp",
                       F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp")/1000), F.col("TimeZone")))
           .withColumn("date", F.to_date("Timestamp"))
           .withColumn("Hour", F.hour("Timestamp"))
           .drop("Timestamp"))
final.show()
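If a pandas UDF is still preferred, another option (a sketch of my own that assumes Spark 3.x, where a scalar pandas_udf may return a StructType by returning a pandas.DataFrame) is to compute both fields in one pass and expand the struct afterwards:
import pandas as pd
import pytz
from datetime import datetime
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StructType, StructField, DateType, IntegerType

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

@pandas_udf(StructType([StructField("date", DateType()),
                        StructField("hour", IntegerType())]))
def date_and_hour(utc_timestamps: pd.Series, ids: pd.Series) -> pd.DataFrame:
    def convert(ts, id_):
        # convert the epoch milliseconds to the mapped time zone
        local = datetime.fromtimestamp(ts / 1000, tz=pytz.timezone(TIMEZONE_LIST[id_]))
        return local.date(), local.hour
    return pd.DataFrame([convert(ts, id_) for ts, id_ in zip(utc_timestamps, ids)],
                        columns=["date", "hour"])

result = (df.withColumn("tmp", date_and_hour("utc_timestamp", "id"))
            .select("utc_timestamp", "id", "tmp.date", "tmp.hour"))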

‘PicklingError’ raised when applying functions defined in a certain class with pyspark

I'm trying to use pandas functions in Spark with applyInPandas. When I call it from within a certain class, it raises an error like this: pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
My script runs well in function-style code:
from scipy.stats import kendalltau
import numpy as np
import pandas as pd

def kendall(dat, a, b):
    kentmp = []
    ken = [np.nan, np.nan]
    if type(a) is list:
        if dat.shape[0] > 3:
            for item in a:
                kentmp.append(kendalltau(dat[item], dat[b])[0])
            tmp = pd.Series(kentmp, index=a).dropna()
            if tmp.shape[0] > 0:
                cato = tmp.idxmax()
                if (tmp < 0).any():
                    cato = tmp.abs().idxmax()
                ken = [cato, tmp[cato]]
        index = ['category', 'corr']
    else:
        if dat.shape[0] >= 10:
            ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
        index = ['corr', 'N']
    return pd.Series(ken, index=index)

def kendall_process(pdf):
    result = pdf.groupby(['step_id','unit_id']).apply(kendall,'process','label')
    result = pd.DataFrame(result).reset_index()
    #result.columns = ['step_id','unit_id','corr','N']
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id','unit_id'])['label'].mean().reset_index()
    result = pd.merge(result,result_,on=['step_id','unit_id'],how='left')
    result.columns = ['step_id','unit_id','corr','N','ratio']
    return result

result = datInOut.groupBy('step_id','unit_id').applyInPandas(kendall_process, schema='step_id string,\
unit_id string,\
corr float,\
N long,\
ratio float')
result.show(5)
+--------------+--------+-----------+----+-----+
| step_id| unit_id| corr| N|ratio|
+--------------+--------+-----------+----+-----+
|10303_A2AOI300|A2AOI300| null|null| 0.0|
|17613_A2AOI500|A2AOI500|-0.13477948| 14| 0.5|
|1B304_A2MAC100|A2MAC100| null|null| 1.0|
|1A106_A2SPR100|A2SPR100| null|null| 1.0|
|19103_A2AOI800|A2AOI800| null|null| 0.5|
+--------------+--------+-----------+----+-----+
only showing top 5 rows
but when I transform it to class-style code, it raises the PicklingError:
@staticmethod
def kendall(dat, a, b):
    kentmp = []
    ken = [np.nan, np.nan]
    if type(a) is list:
        if dat.shape[0] > 3:
            for item in a:
                kentmp.append(kendalltau(dat[item], dat[b])[0])
            tmp = pd.Series(kentmp, index=a).dropna()
            if tmp.shape[0] > 0:
                cato = tmp.idxmax()
                if (tmp < 0).any():
                    cato = tmp.abs().idxmax()
                ken = [cato, tmp[cato]]
        index = ['category', 'corr']
    else:
        if dat.shape[0] >= 10:
            ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
        index = ['corr', 'N']
    return pd.Series(ken, index=index)

@staticmethod
def kendall_delay(pdf):
    result = pdf.groupby(['step_id','equip_id']).apply(QTWorker.kendall, 'delay', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result

ret = datQ.groupBy(self.step, self.equip).applyInPandas(self.kendall_delay, schema='step_id string,equip_id string,corr float,N long,ratio float')
As you can see, I've already decorated the functions with staticmethod, but it still doesn't work. I really want to know how to fix it!
Even though I don't know why, I've solved it by putting the kendall function inside kendall_delay.
I'd really like to figure out the reason for this!
@staticmethod
def kendall_process(pdf):
    def kendall(dat, a, b):
        kentmp = []
        ken = [np.nan, np.nan]
        if type(a) is list:
            if dat.shape[0] > 3:
                for item in a:
                    kentmp.append(kendalltau(dat[item], dat[b])[0])
                tmp = pd.Series(kentmp, index=a).dropna()
                if tmp.shape[0] > 0:
                    cato = tmp.idxmax()
                    if (tmp < 0).any():
                        cato = tmp.abs().idxmax()
                    ken = [cato, tmp[cato]]
            index = ['category', 'corr']
        else:
            if dat.shape[0] >= 10:
                ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
            index = ['corr', 'N']
        return pd.Series(ken, index=index)

    result = pdf.groupby(['step_id','equip_id']).apply(kendall, 'process', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result
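For what it's worth, a likely explanation (my own assumption, not confirmed in the original thread) is that referencing QTWorker.kendall from the function shipped to the executors forces cloudpickle to serialize the whole class, which can drag along state that holds the SparkSession/SparkContext; nesting kendall, or defining it at module level, removes that reference. A simplified sketch of the module-level layout, using a stripped-down hypothetical helper:
import pandas as pd
from pyspark.sql import SparkSession

# module-level helper: it is pickled on its own, without dragging the class along
def add_label_ratio(pdf: pd.DataFrame) -> pd.DataFrame:
    out = pdf.groupby(['step_id', 'equip_id'], as_index=False)['label'].mean()
    return out.rename(columns={'label': 'ratio'})

class QTWorker:
    def __init__(self, spark: SparkSession):
        self.spark = spark                  # driver-side only
        self.step, self.equip = 'step_id', 'equip_id'

    def run(self, datQ):
        # pass the module-level function, not a method reached through self or the
        # class, so nothing referencing the SparkSession gets pickled to executors
        return datQ.groupBy(self.step, self.equip).applyInPandas(
            add_label_ratio, schema='step_id string, equip_id string, ratio double')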

Pyspark pass function as a parameter to UDF

I'm trying to create a UDF which takes another function as a parameter. But the execution ends up with an exception.
The code I run:
import pandas as pd
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import MapType, DataType, StringType
from pyspark.sql.functions import udf, struct, lit
import os

conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

df_to_test = sqlContext.createDataFrame(
    pd.DataFrame({
        'inn': ['111', '222', '333'],
        'field1': [1, 2, 3],
        'field2': ['a', 'b', 'c']
    }))

def foo_fun(row, b) -> str:
    return 'a' + b()

def bar_fun():
    return 'I am bar'

foo_fun_udf = udf(foo_fun, StringType())

df_to_test.withColumn(
    'foo',
    foo_fun_udf(struct([df_to_test[x] for x in df_to_test.columns]), bar_fun)
).show()
The exception:
Invalid argument, not a string or column: <function bar_fun at 0x7f0e69ce6268> of type <class 'function'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
I tried to wrap bar_fun into a udf with no success. Is there a way to pass a function as a parameter?
You are not so far from the solution. Here is how I would do it:
def foo_fun_udf(func):
    def foo_fun(row) -> str:
        return 'a' + func()
    out_udf = udf(foo_fun, StringType())
    return out_udf

df_to_test.withColumn(
    'foo',
    foo_fun_udf(bar_fun)(struct([df_to_test[x] for x in df_to_test.columns]))
).show()
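As a possible follow-up (my own illustration), the factory closes over func at UDF definition time, so the same wrapper can be reused with any zero-argument function, e.g. a hypothetical baz_fun:
def baz_fun():
    return 'I am baz'

df_to_test.withColumn(
    'baz',
    foo_fun_udf(baz_fun)(struct([df_to_test[x] for x in df_to_test.columns]))
).show()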

How to replace non ascii characters in pyspark dataframe [duplicate]

I need to delete accents from characters in Spanish and other languages from different datasets.
I already wrote a function, based on the code provided in this post, that removes the accents. The problem is that the function is slow because it uses a UDF.
I'm just wondering if I can improve its performance to get results in less time, because this is fine for small dataframes but not for big ones.
Thanks in advance.
Here is the code; you will be able to run it as it is presented:
# Importing sql types
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf, col
import unicodedata

# Building a simple dataframe:
schema = StructType([StructField("city", StringType(), True),
                     StructField("country", StringType(), True),
                     StructField("population", IntegerType(), True)])

countries = ['Venezuela', 'US#A', 'Brazil', 'Spain']
cities = ['Maracaibó', 'New York', ' São Paulo ', '~Madrid']
population = [37800000, 19795791, 12341418, 6489162]

# Dataframe:
df = sqlContext.createDataFrame(list(zip(cities, countries, population)), schema=schema)
df.show()

class Test():
    def __init__(self, df):
        self.df = df

    def clearAccents(self, columns):
        """This function deletes accents in string columns of a dataFrame;
        it does not eliminate the main characters, but only deletes the accent marks.
        :param columns  String or a list of column names.
        """
        # Filters all string columns in the dataFrame
        validCols = [c for (c, t) in filter(lambda t: t[1] == 'string', self.df.dtypes)]

        # If None or [] is provided with column parameter:
        if (columns == "*"): columns = validCols[:]

        # Receives a string as an argument
        def remove_accents(inputStr):
            # first, normalize strings:
            nfkdStr = unicodedata.normalize('NFKD', inputStr)
            # keep chars that have no other char combined (i.e. accent chars)
            withOutAccents = u"".join([c for c in nfkdStr if not unicodedata.combining(c)])
            return withOutAccents

        function = udf(lambda x: remove_accents(x) if x != None else x, StringType())
        exprs = [function(col(c)).alias(c) if (c in columns) and (c in validCols) else c for c in self.df.columns]
        self.df = self.df.select(*exprs)

foo = Test(df)
foo.clearAccents(columns="*")
foo.df.show()
One possible improvement is to build a custom Transformer, which will handle Unicode normalization, and corresponding Python wrapper. It should reduce overall overhead of passing data between JVM and Python and doesn't require any modifications in Spark itself or access to private API.
On JVM side you'll need a transformer similar to this one:
package net.zero323.spark.ml.feature

import java.text.Normalizer
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{DataType, StringType}

class UnicodeNormalizer (override val uid: String)
  extends UnaryTransformer[String, String, UnicodeNormalizer] {

  def this() = this(Identifiable.randomUID("unicode_normalizer"))

  private val forms = Map(
    "NFC" -> Normalizer.Form.NFC, "NFD" -> Normalizer.Form.NFD,
    "NFKC" -> Normalizer.Form.NFKC, "NFKD" -> Normalizer.Form.NFKD
  )

  val form: Param[String] = new Param(this, "form", "unicode form (one of NFC, NFD, NFKC, NFKD)",
    ParamValidators.inArray(forms.keys.toArray))

  def setN(value: String): this.type = set(form, value)

  def getForm: String = $(form)

  setDefault(form -> "NFKD")

  override protected def createTransformFunc: String => String = {
    val normalizerForm = forms($(form))
    (s: String) => Normalizer.normalize(s, normalizerForm)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = StringType
}
Corresponding build definition (adjust Spark and Scala versions to match your Spark deployment):
name := "unicode-normalization"
version := "1.0"
crossScalaVersions := Seq("2.11.12", "2.12.8")
organization := "net.zero323"
val sparkVersion = "2.4.0"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-mllib" % sparkVersion
)
On Python side you'll need a wrapper similar to this one.
from pyspark.ml.param.shared import *
# from pyspark.ml.util import keyword_only  # in Spark < 2.0
from pyspark import keyword_only
from pyspark.ml.wrapper import JavaTransformer

class UnicodeNormalizer(JavaTransformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, form="NFKD", inputCol=None, outputCol=None):
        super(UnicodeNormalizer, self).__init__()
        self._java_obj = self._new_java_obj(
            "net.zero323.spark.ml.feature.UnicodeNormalizer", self.uid)
        self.form = Param(self, "form",
                          "unicode form (one of NFC, NFD, NFKC, NFKD)")
        # kwargs = self.__init__._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, form="NFKD", inputCol=None, outputCol=None):
        # kwargs = self.setParams._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setForm(self, value):
        return self._set(form=value)

    def getForm(self):
        return self.getOrDefault(self.form)
Build Scala package:
sbt +package
and include it when you start the shell or during submit. For example, for a Spark build with Scala 2.11:
bin/pyspark --jars path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar \
--driver-class-path path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar
and you should be ready to go. All that is left is a little bit of regexp magic:
from pyspark.sql.functions import regexp_replace

normalizer = UnicodeNormalizer(form="NFKD",
                               inputCol="text", outputCol="text_normalized")

df = sc.parallelize([
    (1, "Maracaibó"), (2, "New York"),
    (3, " São Paulo "), (4, "~Madrid")
]).toDF(["id", "text"])

(normalizer
    .transform(df)
    .select(regexp_replace("text_normalized", "\p{M}", ""))
    .show())
## +--------------------------------------+
## |regexp_replace(text_normalized,\p{M},)|
## +--------------------------------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## +--------------------------------------+
Please note that this follows the same conventions as the built-in text transformers and is not null safe. You can easily correct for that by checking for null in createTransformFunc.
Another way of doing this, using the Python Unicode database:
import unicodedata
import sys
from pyspark.sql.functions import translate, regexp_replace

def make_trans():
    matching_string = ""
    replace_string = ""
    for i in range(ord(" "), sys.maxunicode):
        name = unicodedata.name(chr(i), "")
        if "WITH" in name:
            try:
                base = unicodedata.lookup(name.split(" WITH")[0])
                matching_string += chr(i)
                replace_string += base
            except KeyError:
                pass
    return matching_string, replace_string

def clean_text(c):
    matching_string, replace_string = make_trans()
    return translate(
        regexp_replace(c, "\p{M}", ""),
        matching_string, replace_string
    ).alias(c)
So now let's test it:
df = sc.parallelize([
    (1, "Maracaibó"), (2, "New York"),
    (3, " São Paulo "), (4, "~Madrid"),
    (5, "São Paulo"), (6, "Maracaibó")
]).toDF(["id", "text"])

df.select(clean_text("text")).show()
## +---------------+
## | text|
## +---------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## | Sao Paulo|
## | Maracaibo|
## +---------------+
Acknowledgement: @zero323
This solution is Python only, but is only useful if the number of possible accents is low (e.g. one single language like Spanish) and the character replacements are manually specified.
There seems to be no built-in way to do what you asked for directly without UDFs, however you can chain many regexp_replace calls to replace each possible accented character. I tested the performance of this solution and it turns out that it only runs faster if you have a very limited set of accents to replace. If that's the case it can be faster than UDFs because it is optimized outside of Python.
from pyspark.sql.functions import col, regexp_replace

accent_replacements_spanish = [
    (u'á', 'a'), (u'Á', 'A'),
    (u'é', 'e'), (u'É', 'E'),
    (u'í', 'i'), (u'Í', 'I'),
    (u'ò', 'o'), (u'Ó', 'O'),
    (u'ú|ü', 'u'), (u'Ú|Ű', 'U'),
    (u'ñ', 'n'),
    # see http://stackoverflow.com/a/18123985/3810493 for other characters
    # this will convert other non ASCII characters to a question mark:
    ('[^\x00-\x7F]', '?')
]

def remove_accents(column):
    r = col(column)
    for a, b in accent_replacements_spanish:
        r = regexp_replace(r, a, b)
    return r.alias('remove_accents(' + column + ')')

df = sqlContext.createDataFrame([['Olà'], ['Olé'], ['Núñez']], ['str'])
df.select(remove_accents('str')).show()
I haven't compared the performance with the other responses and this function is not as general, but it is at least worth considering because you don't need to add Scala or Java to your build process.
Here's my implementation.
Apart from accents, I also remove special characters, because I needed to pivot and save a table, and you can't save a table with a column name that contains any of the " ,;{}()\n\t=\/" characters.
import re
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from unidecode import unidecode

spark = SparkSession.builder.getOrCreate()

data = [(1, " \\ / \\ {____} aŠdá_ \t = \n () asd ____aa 2134_ 23_"), (1, "N"), (2, "false"), (2, "1"), (3, "NULL"),
        (3, None)]
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])
df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
df.show()

for col_name in ["txt"]:
    tmp_dict = {}
    for col_value in [row[0] for row in df.select(col_name).distinct().toLocalIterator()
                      if row[0] is not None]:
        new_col_value = re.sub("[ ,;{}()\\n\\t=\\\/]", "_", col_value)
        new_col_value = re.sub('_+', '_', new_col_value)
        if new_col_value.startswith("_"):
            new_col_value = new_col_value[1:]
        if new_col_value.endswith("_"):
            new_col_value = new_col_value[:-1]
        new_col_value = unidecode(new_col_value)
        tmp_dict[col_value] = new_col_value.lower()
    df = df.na.replace(to_replace=tmp_dict, subset=[col_name])
df.show()
If you can't access external libraries (like me), you can replace unidecode with:
new_col_value = new_col_value.translate(str.maketrans(
    "ä,ö,ü,ẞ,á,ä,č,ď,é,ě,í,ĺ,ľ,ň,ó,ô,ŕ,š,ť,ú,ů,ý,ž,Ä,Ö,Ü,ẞ,Á,Ä,Č,Ď,É,Ě,Í,Ĺ,Ľ,Ň,Ó,Ô,Ŕ,Š,Ť,Ú,Ů,Ý,Ž",
    "a,o,u,s,a,a,c,d,e,e,i,l,l,n,o,o,r,s,t,u,u,y,z,A,O,U,S,A,A,C,D,E,E,I,L,L,N,O,O,R,S,T,U,U,Y,Z"))
