I need to remove accents from characters in Spanish and other languages in different datasets.
I already wrote a function, based on the code provided in this post, that removes the accents. The problem is that the function is slow because it uses a UDF.
I'm wondering whether I can improve its performance and get results in less time, because this works fine for small DataFrames but not for big ones.
Thanks in advance.
Here is the code; you can run it exactly as presented:
# Importing sql types
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf, col
import unicodedata
# Building a simple dataframe:
schema = StructType([StructField("city", StringType(), True),
StructField("country", StringType(), True),
StructField("population", IntegerType(), True)])
countries = ['Venezuela', 'US#A', 'Brazil', 'Spain']
cities = ['Maracaibó', 'New York', ' São Paulo ', '~Madrid']
population = [37800000,19795791,12341418,6489162]
# Dataframe:
df = sqlContext.createDataFrame(list(zip(cities, countries, population)), schema=schema)
df.show()
class Test():
    def __init__(self, df):
        self.df = df

    def clearAccents(self, columns):
        """Deletes accents from the string columns of the DataFrame.
        It does not remove the base characters, only the combining accent marks.
        :param columns: a column name, a list of column names, or "*" for all string columns.
        """
        # Filter all string columns in the DataFrame
        validCols = [c for (c, t) in filter(lambda t: t[1] == 'string', self.df.dtypes)]

        # If "*" is provided as the columns parameter, use every string column:
        if columns == "*":
            columns = validCols[:]

        # Receives a string as an argument
        def remove_accents(inputStr):
            # First, normalize the string:
            nfkdStr = unicodedata.normalize('NFKD', inputStr)
            # Keep only chars that are not combining marks (i.e. accent chars)
            withOutAccents = u"".join([c for c in nfkdStr if not unicodedata.combining(c)])
            return withOutAccents

        function = udf(lambda x: remove_accents(x) if x is not None else x, StringType())
        exprs = [function(col(c)).alias(c) if (c in columns) and (c in validCols) else c
                 for c in self.df.columns]
        self.df = self.df.select(*exprs)
foo = Test(df)
foo.clearAccents(columns="*")
foo.df.show()
One possible improvement is to build a custom Transformer, which will handle the Unicode normalization, plus a corresponding Python wrapper. It should reduce the overall overhead of passing data between the JVM and Python, and it doesn't require any modification to Spark itself or access to private APIs.
On the JVM side you'll need a transformer similar to this one:
package net.zero323.spark.ml.feature
import java.text.Normalizer
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{DataType, StringType}
class UnicodeNormalizer(override val uid: String)
  extends UnaryTransformer[String, String, UnicodeNormalizer] {

  def this() = this(Identifiable.randomUID("unicode_normalizer"))

  private val forms = Map(
    "NFC" -> Normalizer.Form.NFC, "NFD" -> Normalizer.Form.NFD,
    "NFKC" -> Normalizer.Form.NFKC, "NFKD" -> Normalizer.Form.NFKD
  )

  val form: Param[String] = new Param(this, "form",
    "unicode form (one of NFC, NFD, NFKC, NFKD)",
    ParamValidators.inArray(forms.keys.toArray))

  def setForm(value: String): this.type = set(form, value)

  def getForm: String = $(form)

  setDefault(form -> "NFKD")

  override protected def createTransformFunc: String => String = {
    val normalizerForm = forms($(form))
    (s: String) => Normalizer.normalize(s, normalizerForm)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = StringType
}
Corresponding build definition (adjust Spark and Scala versions to match your Spark deployment):
name := "unicode-normalization"
version := "1.0"
crossScalaVersions := Seq("2.11.12", "2.12.8")
organization := "net.zero323"
val sparkVersion = "2.4.0"
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % sparkVersion,
"org.apache.spark" %% "spark-sql" % sparkVersion,
"org.apache.spark" %% "spark-mllib" % sparkVersion
)
On the Python side you'll need a wrapper similar to this one:
from pyspark.ml.param.shared import *
# from pyspark.ml.util import keyword_only  # in Spark < 2.0
from pyspark import keyword_only
from pyspark.ml.wrapper import JavaTransformer


class UnicodeNormalizer(JavaTransformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, form="NFKD", inputCol=None, outputCol=None):
        super(UnicodeNormalizer, self).__init__()
        self._java_obj = self._new_java_obj(
            "net.zero323.spark.ml.feature.UnicodeNormalizer", self.uid)
        self.form = Param(self, "form",
                          "unicode form (one of NFC, NFD, NFKC, NFKD)")
        # kwargs = self.__init__._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, form="NFKD", inputCol=None, outputCol=None):
        # kwargs = self.setParams._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setForm(self, value):
        return self._set(form=value)

    def getForm(self):
        return self.getOrDefault(self.form)
Build the Scala package:
sbt +package
and include it when you start the shell or submit the application. For example, for a Spark build with Scala 2.11:
bin/pyspark --jars path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar \
 --driver-class-path path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar
and you should be ready to go. All that is left is a little bit of regexp magic:
from pyspark.sql.functions import regexp_replace
normalizer = UnicodeNormalizer(form="NFKD",
inputCol="text", outputCol="text_normalized")
df = sc.parallelize([
(1, "Maracaibó"), (2, "New York"),
(3, " São Paulo "), (4, "~Madrid")
]).toDF(["id", "text"])
(normalizer
.transform(df)
.select(regexp_replace("text_normalized", "\p{M}", ""))
.show())
## +--------------------------------------+
## |regexp_replace(text_normalized,\p{M},)|
## +--------------------------------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## +--------------------------------------+
Please note that this follows the same conventions as the built-in text transformers and is not null safe. You can easily correct for that by checking for null in createTransformFunc.
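If you prefer to keep the Scala side untouched, a possible DataFrame-level workaround (my sketch, not part of the original transformer; it assumes nulls should stay null) is to feed the transformer a non-null column and restore the nulls afterwards:
from pyspark.sql import functions as F

# Sketch: coalesce nulls to "" before the transformer, then put the nulls back.
df_in = df.withColumn("text_nn", F.coalesce(F.col("text"), F.lit("")))

normalizer_nn = UnicodeNormalizer(form="NFKD", inputCol="text_nn", outputCol="text_normalized")

df_out = (normalizer_nn
    .transform(df_in)
    .withColumn("text_normalized",
                F.when(F.col("text").isNull(), None)
                 .otherwise(F.regexp_replace("text_normalized", r"\p{M}", ""))))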
Another way of doing it, using the Python Unicode database:
import unicodedata
import sys
from pyspark.sql.functions import translate, regexp_replace
def make_trans():
    matching_string = ""
    replace_string = ""

    for i in range(ord(" "), sys.maxunicode):
        name = unicodedata.name(chr(i), "")
        if "WITH" in name:
            try:
                base = unicodedata.lookup(name.split(" WITH")[0])
                matching_string += chr(i)
                replace_string += base
            except KeyError:
                pass

    return matching_string, replace_string


def clean_text(c):
    matching_string, replace_string = make_trans()
    return translate(
        regexp_replace(c, r"\p{M}", ""),
        matching_string, replace_string
    ).alias(c)
So now let's test it:
df = sc.parallelize([
(1, "Maracaibó"), (2, "New York"),
(3, " São Paulo "), (4, "~Madrid"),
(5, "São Paulo"), (6, "Maracaibó")
]).toDF(["id", "text"])
df.select(clean_text("text")).show()
## +---------------+
## | text|
## +---------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## | Sao Paulo|
## | Maracaibo|
## +---------------+
Thanks to @zero323.
This solution is Python only, but it is only useful if the number of possible accents is low (e.g. a single language like Spanish) and the character replacements are specified manually.
There seems to be no built-in way to do what you asked for directly without UDFs; however, you can chain many regexp_replace calls to replace each possible accented character. I tested the performance of this solution and it turns out that it only runs faster if you have a very limited set of accents to replace. If that's the case, it can be faster than UDFs because it is optimized outside of Python.
from pyspark.sql.functions import col, regexp_replace
accent_replacements_spanish = [
(u'á', 'a'), (u'Á', 'A'),
(u'é', 'e'), (u'É', 'E'),
(u'í', 'i'), (u'Í', 'I'),
(u'ó', 'o'), (u'Ó', 'O'),
(u'ú|ü', 'u'), (u'Ú|Ü', 'U'),
(u'ñ', 'n'),
# see http://stackoverflow.com/a/18123985/3810493 for other characters
# this will convert other non ASCII characters to a question mark:
('[^\x00-\x7F]', '?')
]
def remove_accents(column):
r = col(column)
for a, b in accent_replacements_spanish:
r = regexp_replace(r, a, b)
return r.alias('remove_accents(' + column + ')')
df = sqlContext.createDataFrame([['Olà'], ['Olé'], ['Núñez']], ['str'])
df.select(remove_accents('str')).show()
I haven't compared the performance with the other responses and this function is not as general, but it is at least worth considering because you don't need to add Scala or Java to your build process.
Here's my implementation.
Apart from accents, I also remove special characters, because I needed to pivot and save a table, and you can't save a table whose column names contain the characters " ,;{}()\n\t=\/".
import re
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from unidecode import unidecode
spark = SparkSession.builder.getOrCreate()
data = [(1, " \\ / \\ {____} aŠdá_ \t = \n () asd ____aa 2134_ 23_"), (1, "N"), (2, "false"), (2, "1"), (3, "NULL"),
(3, None)]
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])
df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
df.show()
for col_name in ["txt"]:
tmp_dict = {}
for col_value in [row[0] for row in df.select(col_name).distinct().toLocalIterator()
if row[0] is not None]:
new_col_value = re.sub("[ ,;{}()\\n\\t=\\\/]", "_", col_value)
new_col_value = re.sub('_+', '_', new_col_value)
if new_col_value.startswith("_"):
new_col_value = new_col_value[1:]
if new_col_value.endswith("_"):
new_col_value = new_col_value[:-1]
new_col_value = unidecode(new_col_value)
tmp_dict[col_value] = new_col_value.lower()
df = df.na.replace(to_replace=tmp_dict, subset=[col_name])
df.show()
If you can't access external libraries (like me), you can replace unidecode with:
new_col_value = new_col_value.translate(str.maketrans(
"ä,ö,ü,ẞ,á,ä,č,ď,é,ě,í,ĺ,ľ,ň,ó,ô,ŕ,š,ť,ú,ů,ý,ž,Ä,Ö,Ü,ẞ,Á,Ä,Č,Ď,É,Ě,Í,Ĺ,Ľ,Ň,Ó,Ô,Ŕ,Š,Ť,Ú,Ů,Ý,Ž",
"a,o,u,s,a,a,c,d,e,e,i,l,l,n,o,o,r,s,t,u,u,y,z,A,O,U,S,A,A,C,D,E,E,I,L,L,N,O,O,R,S,T,U,U,Y,Z"))
I have a column in a pyspark df that contains an array of maps like the one below:
[{"address": "Fadden", "city": "", "country": "", "note": "", "stateProvince": "Queensland"}]
df.printSchema() returns the following for the column:
|-- constituencies: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- address: string (nullable = true)
| | |-- city: string (nullable = true)
| | |-- country: string (nullable = true)
| | |-- note: string (nullable = true)
| | |-- stateProvince: string (nullable = true)
And I want to nullify all those empty strings, so I thought this would be a perfect problem to solve with F.transform(col, f).
So I created the function, and then I used it in the transform expression like below:
def nullify_vals(d):
    def nullify_string(str_):
        if str_.strip() == "":
            return None
        return str_.strip()

    return (
        dict((k, nullify_string(v)) for k, v in d.items())
    )
Note that the above works when tested on a dictionary:
dd = {"my": "map", "is": "", "not": " ", "entierly": " empty , right?"}
d_cln = nullify_vals(dd)
d_cln["not"] is None # returns True
But when I then use it in Pyspark, it gives me an error:
import pyspark.sql.functions as F
result = kyclean.select(F.transform("constituencies", nullify_vals))
TypeError: 'Column' object is not callable
These are the last lines of the stacktrace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File <command-899394298900126>:1, in <module>
----> 1 result = kyclean.select(F.transform("constituencies", nullify_vals))
File /databricks/spark/python/pyspark/sql/functions.py:4260, in transform(col, f)
4214 def transform(col, f):
4215 """
4216 Returns an array of elements after applying a transformation to each element in the input array.
4217
(...)
4258 +--------------+
4259 """
-> 4260 return _invoke_higher_order_function("ArrayTransform", [col], [f])
File /databricks/spark/python/pyspark/sql/functions.py:4209, in _invoke_higher_order_function(name, cols, funs)
4206 expr = getattr(expressions, name)
4208 jcols = [_to_java_column(col).expr() for col in cols]
-> 4209 jfuns = [_create_lambda(f) for f in funs]
4211 return Column(sc._jvm.Column(expr(*jcols + jfuns)))
Your function nullify_vals should take a Column object of type StructType, since your array elements are structs, but you're passing in normal Python objects.
Try rewriting it like this instead:
from typing import List

from pyspark.sql import functions as F, Column


def nullify_vals(struct_col: Column, fields: List[str]) -> Column:
    for f in fields:
        struct_col = struct_col.withField(
            f,
            F.when(F.trim(struct_col[f]) == "", None).otherwise(struct_col[f])
        )
    return struct_col
For each field in the inner struct, we use the Column method withField to update it: if the field equals the empty string, we set it to null.
Applied to your input example:
json_str = '{"constituencies":[{"address":"Fadden","city":"","country":"","note":"","stateProvince":"Queensland"}]}'
df = spark.read.json(spark.sparkContext.parallelize([json_str]))
You can get the list of constituencies struct fields from dataframe schema:
constituencies_fields = df.selectExpr("inline(constituencies)").columns
df1 = df.withColumn(
"constituencies",
F.transform("constituencies", lambda x: nullify_vals(x, constituencies_fields))
)
df1.show(truncate=False)
#+----------------------------------------+
#|constituencies |
#+----------------------------------------+
#|[{Fadden, null, null, null, Queensland}]|
#+----------------------------------------+
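As a quick check (my addition, not part of the original answer), you can confirm the empty strings were turned into nulls by exploding the structs back into columns:
# Each of these columns should now show null instead of an empty string.
df1.selectExpr("inline(constituencies)").select("city", "country", "note").show()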
I'm still looking into the error you got and I'll update the post when I figure out what's wrong. In the meantime, you can do something like this to work around it
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
schema = ArrayType(
StructType([
StructField('address', StringType()),
StructField('city', StringType()),
StructField('country', StringType()),
StructField('note', StringType()),
StructField('stateProvince', StringType()),
]), True)
nullify_udf = udf(
    lambda arr: [[(v if v is not None and v.strip() != "" else None) for v in area] for area in arr],
    schema)
result = kyclean.withColumn('constituencies', nullify_udf('constituencies'))
The specific error you got says that you can't call d.items() as a function: the input function really needs to work on the Column object d that gets passed in.
The description of pyspark.sql.functions.transform says, "Returns an array of elements after applying a transformation to each element in the input array."
But inside the description of the accepted function, f, it says, "...and can use methods of Column, functions defined in pyspark.sql.functions and Scala UserDefinedFunctions. Python UserDefinedFunctions are not supported (SPARK-27052)." So it can't take in custom Python UserDefinedFunctions yet, which is sort of what you were trying to do.
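To illustrate the constraint, here is a minimal sketch (my example, assuming Spark >= 3.1, where pyspark.sql.functions.transform accepts a Python lambda, and a SparkSession named spark) of the kind of function transform does accept, i.e. one built only from Column expressions:
from pyspark.sql import functions as F

# Trim every element of an array<string> column using only Column expressions
# inside the lambda; no Python UDF is involved.
df_demo = spark.createDataFrame([(["  a ", "b", " c"],)], ["arr"])
df_demo.select(F.transform("arr", lambda x: F.trim(x)).alias("trimmed")).show()
# expected: [a, b, c]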
Pardon my ignorance, I am new to pyspark. I'm trying to improve a UDF that creates a new column count_adj based on the values of another column a_type, using a dictionary. How do I account for None/null values in this process when creating my new column? This is super easy in pandas (df['adj_count'] = df.a_type.map(count_map)) but I'm struggling to do it in pyspark.
Sample data / imports:
# all imports used -- not just for this portion of the script
from pyspark.sql import SparkSession, HiveContext, SQLContext
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark import sql
import pyspark.sql.functions as F
import random
from pyspark.sql.functions import lit
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from datetime import datetime
from datetime import date
from datetime import timedelta
from pyspark.sql import Window
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import rank, row_number, max as max_, col
import sys
import os
spark = SparkSession.builder.appName('a_type_tests').getOrCreate()
# note: sample data has results from the original udf for comparison
dataDictionary = [
(26551, 491, '2022-01-22', '740', -1, 16),
(24192, 338, '2022-01-22', '740', -1, 16),
(26555, 3013, '2022-01-22', '740', -1, 16),
(26571, 937, '2022-01-22', '740', -1, 16),
(24376, 371, '2022-01-22', '740', -1, 16),
(17716, 118, '2022-01-22', '740', -1, 16),
(26554, 3013, '2022-01-22', '740', -1, 16),
(26734, 105, '2022-01-22', '740', -1, 16),
(26051, 415, '2022-01-22', '600', -1, 8),
(26602, 501, '2022-01-22', '740', -1, 16),
(26125, 501, '2022-01-22', None, -1, 0)
]
sdf = spark.createDataFrame(data=dataDictionary, schema = ['id', 'loc_id', 'a_date', 'a_type', 'adj_val', 'udf_original'])
sdf.printSchema()
sdf.show(truncate=False)
The original udf is similar to:
def count_adj(a_type):
    if a_type is None:
        return 0
    elif a_type in ('703', '704', '705', '708', '900', '910'):
        return 4
    elif a_type in ('701', '702'):
        return 2
    elif a_type in ('711', '712'):
        return 1
    elif a_type in ('600', '704'):
        return 8
    elif a_type in ('740',):
        return 16
    elif a_type in ('305', '306'):
        return 32
    elif a_type in ('601', '612', '615'):
        return 64
    else:
        return 128
I've created a dictionary to correspond to these values.
# remove 0:None type pairing because None is not iterable to invert dict
count_map = {1:['711','712'], \
2:['701','702'], \
4:['703','704','705','708','900','910'], \
8:['600', '704'], \
16:['740'], \
32:['305','306'], \
64:['601','612','615'], \
128: ['1600', '1601', '1602']
}
# invert dict
count_map = {c:key for key, vals in count_map.items() for c in vals}
# create None mapping manually
count_map[None] = 0
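For reference (my note, not in the original post): after the inversion each code string maps to its count, and the manually added None key is exactly what later makes create_map fail, because Spark map keys cannot be null.
# After inversion, count_map looks roughly like:
# {'711': 1, '712': 1, '701': 2, '702': 2, '703': 4, ..., '1602': 128, None: 0}
print(count_map['740'])  # 16
print(count_map[None])   # 0  (this None key triggers "Cannot use null as map key")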
Searching SO I came across this, which resulted in the following error:
# Code tried:
# Changes None type to NULL -- fine, but how do I account for these None/Null values in my dict?
from itertools import chain  # needed for chain(*count_map.items())
map_expr = F.create_map([lit(x) for x in chain(*count_map.items())])
sdf2 = sdf.withColumn('count_adj', map_expr.getItem('a_type'))
# or:
sdf2 = sdf.withColumn('count_adj',map_expr[col('a_type')]).show()
# Error
Py4JJavaError: An error occurred while calling o334.showString.
: java.lang.RuntimeException: Cannot use null as map key.
How do I account for None / NULL types when using a dictionary to create a new column based on values from another column? Is it possible to include a NULL check in my map expression or something else entirely?
For completeness: I removed the None type from the dictionary and used a method similar to Karthik's answer, combined with other SO posts mentioned in the question.
My final solution relies on the code below, using .when() and .isNull() to account for the None/NULL conversion.
# Original Mapping
# remove 0:None type pairing because None is not iterable to invert dict
count_map = {1:['711','712'], \
2:['701','702'], \
4:['703','704','705','708','900','910'], \
8:['600', '704'], \
16:['740'], \
32:['305','306'], \
64:['601','612','615'], \
128: ['1600', '1601', '1602']
}
# invert dict
count_map = {c:key for key, vals in count_map.items() for c in vals}
# New below:
map_expr = F.create_map([lit(x) for x in chain(*count_map.items())])
sdf2 = sdf.withColumn('count_adj', F.when( col('a_type').isNull(), 0 ).otherwise( map_expr.getItem(col('a_type') ) ) )
The keys of a map column must all have the same data type and can't be null, so the map won't accept None/NULL values as keys.
Instead of the above code, you can use the when function, which gives your desired output as shown below:
newDF = sdf.withColumn("count_adj",F.when(F.col("a_type").isNull(),0)\
.when(F.col("a_type").isin('711','712'),1)\
.when(F.col("a_type").isin('701','702'),2)\
.when(F.col("a_type").isin('703','704','705','708','900','910'),4)\
.when(F.col("a_type").isin('600', '704'),8)\
.when(F.col("a_type").isin('740'),16)\
.when(F.col("a_type").isin('305','306'),32)\
.when(F.col("a_type").isin('601','612','615'),64)\
.otherwise(128))
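As a quick sanity check (my addition, not part of the original answer), the new column should line up with the udf_original column that was carried along in the sample data:
# Rows where the when-chain disagrees with the original UDF result;
# expected to be empty for the sample data above.
newDF.filter(F.col("count_adj") != F.col("udf_original")).show()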
I have a DataFrame with utc_timestamp and id columns, and I need to extract from utc_timestamp its date and its hour into two different columns, depending on the time zone. The time zone name is determined by id from a configuration constant.
Input DF:
+-------------+---+
|utc_timestamp| id|
+-------------+---+
|1608000000782|  1|
|1608000240782|  2|
+-------------+---+

Output DF:
+-------------+---+----------+----+
|utc_timestamp| id|      date|hour|
+-------------+---+----------+----+
|1608000000782|  1|2020-12-14|  20|
|1608000240782|  2|2020-12-15|  11|
+-------------+---+----------+----+
I have a pandas_udf that allows me to extract one column at a time, so I have to create it twice:
from pyspark.sql import DataFrame
from pyspark.sql import functions as f
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import DateType, IntegerType
import pandas as pd
import pytz

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}


class TimezoneUdfProvider(object):
    def __init__(self):
        self.extract_date_udf = pandas_udf(self._extract_date, DateType(), PandasUDFType.SCALAR)
        self.extract_hour_udf = pandas_udf(self._extract_hour, IntegerType(), PandasUDFType.SCALAR)

    def _extract_date(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_date(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])

    def _extract_hour(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_hour(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])


def extract_date(utc_timestamp: int, id: str):
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).date()


def extract_hour(utc_timestamp: int, id: str) -> int:
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).hour


def extract_from_utc(df: DataFrame) -> DataFrame:
    timezone_udf1 = TimezoneUdfProvider()
    df_with_date = df.withColumn('date', timezone_udf1.extract_date_udf(f.col('utc_timestamp'), f.col('id')))
    timezone_udf2 = TimezoneUdfProvider()
    df_with_hour = df_with_date.withColumn('hour', timezone_udf2.extract_hour_udf(f.col('utc_timestamp'), f.col('id')))
    return df_with_hour
Is there a better way to do it, without needing to use the same UDF provider twice?
You can do this without a UDF, using Spark built-in functions.
We can use create_map to map the dictionary and create a new timezone column, then convert using from_unixtime and from_utc_timestamp with the time zone taken from the newly mapped column. Once we have the timestamp in the right time zone, we can fetch the hour and date fields.
TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}
import pyspark.sql.functions as F
from itertools import chain
map_exp = F.create_map([F.lit(i) for i in chain(*TIMEZONE_LIST.items())])
final = (df.withColumn("TimeZone", map_exp.getItem(col("id")))
.withColumn("Timestamp",
F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp")/1000),F.col("TimeZone")))
.withColumn("date",F.to_date("Timestamp")).withColumn("Hour",F.hour("Timestamp"))
.drop("Timestamp"))
final.show()
+-------------+---+---------------+----------+----+
|utc_timestamp| id| TimeZone| date|Hour|
+-------------+---+---------------+----------+----+
|1608000000782| 1|America/Chicago|2020-12-14| 20|
|1608000240782| 2| Asia/Tokyo|2020-12-15| 11|
+-------------+---+---------------+----------+----+
EDIT: replacing create_map with a udf:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

def fun(x):
    return TIMEZONE_LIST.get(x, None)

map_udf = F.udf(fun, StringType())
final = (df.withColumn("TimeZone", map_udf("id")).withColumn("Timestamp",
F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp")/1000),F.col("TimeZone")))
.withColumn("date",F.to_date("Timestamp")).withColumn("Hour",F.hour("Timestamp"))
.drop("Timestamp"))
final.show()
I'm doing RSS parsing and getting news from the news headlines:
def print_headlines_test(rss_dict):
    for key, url in rss_dict.items():
        feed = feedparser.parse(url)
        headlines = []
        allheadlines = []
        for newsitem in feed['items']:
            headlines.append(newsitem['title'])
    for key, url in rss_dict.items():
        allheadlines.extend(headlines)
Then I'm saving this to csv and reading the df:
def write_and_read():
    header = ['Tittle', 'Desc']
    with open('news.csv', 'w', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)  # assumed: the csv writer is created here
        writer.writerow(i for i in header)
        for a in zip(allheadlines):
            writer.writerow((a))
    df = pd.read_csv('news.csv')
Then I'm searching the news for certain targets (t and t2):
t = 'word1|word2|word3'
t2 = 'word3|word4|word5'
And making a dataframe of this:
def certain_words(t, t2):
    result = df.apply(lambda x: x.str.contains(t, na=False,
                                               flags=re.IGNORECASE, regex=True)).any(axis=1)
    result2 = df.apply(lambda x: x.str.contains(t2, na=False,
                                                flags=re.IGNORECASE, regex=True)).any(axis=1)
    df[result & result2]
So my input values are rss_dict (a dictionary of RSS feeds in the format {'rss-name': 'rss-link'}) and two targets (t, t2).
Now my question: how should I combine all of these functions into something (a function, or maybe a class) so that I can set these three values (rss_dict, t, t2) and have my code run immediately?
You can use a class with all these functions included inside it, as follows:
class News:
    def __init__(self, rss_dict, t, t2):
        # store the constructor arguments on the instance
        self.rss_dict = rss_dict
        self.t = t
        self.t2 = t2

    def print_headlines_test(self):
        for key, url in self.rss_dict.items():
            feed = feedparser.parse(url)
            headlines = []
            allheadlines = []
            for newsitem in feed['items']:
                headlines.append(newsitem['title'])
        for key, url in self.rss_dict.items():
            allheadlines.extend(headlines)
        self.allheadlines = allheadlines

    def write_and_read(self):
        header = ['Tittle', 'Desc']
        with open('news.csv', 'w', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)  # assumed: the csv writer is created here
            writer.writerow(i for i in header)
            for a in zip(self.allheadlines):
                writer.writerow((a))
        self.df = pd.read_csv('news.csv')

    def certain_words(self):
        df = self.df
        result = df.apply(lambda x: x.str.contains(self.t, na=False,
                                                   flags=re.IGNORECASE, regex=True)).any(axis=1)
        result2 = df.apply(lambda x: x.str.contains(self.t2, na=False,
                                                    flags=re.IGNORECASE, regex=True)).any(axis=1)
        return df[result & result2]
You should pass the parameters by creating an object of the class, and the functions have to be called on that object for them to run.
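For example, a usage sketch (my addition, assuming feedparser, pandas as pd, csv and re are imported, and using a placeholder feed URL):
# Hypothetical usage of the News class above
rss_dict = {"example-feed": "https://example.com/rss"}  # placeholder RSS feed
news = News(rss_dict, t='word1|word2|word3', t2='word3|word4|word5')

news.print_headlines_test()     # collect headlines from the feeds
news.write_and_read()           # dump them to news.csv and read them back
matches = news.certain_words()  # rows matching both target patterns
print(matches)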
I'm trying to define a UDF with an accumulator inside it. The accumulator is used to save exceptions raised in my_function for later. I came up with a definition of the UDF that takes some arguments (returnType, accumulator).
I'd like to make it a bit more readable and reusable. How could I define a decorator function with the code below?
from pyspark.sql import functions as F
from pyspark.accumulators import AccumulatorParam
from pyspark.sql.types import StringType, StructField, IntegerType, StructType
from pyspark.sql import Row
data = [
Row(word="foo", number=7),
Row(word="bar", number=13)]
schema = StructType([
StructField("word", StringType(), True),
StructField("number", IntegerType(), True)])
df = spark.createDataFrame(data, schema)
Creation of my custom accumulator
class ListParam(AccumulatorParam):
    def zero(self, v):
        return []

    def addInPlace(self, variable, value):
        variable.append(value)
        return variable
accum = spark.sparkContext.accumulator([], ListParam())
Definition of my udf
def accumulator_udf(accumulator, returnType):
    def my_function(x):
        y = None
        try:
            y = (x / (x - 7))
        except Exception as e:
            accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
        return y
    return F.udf(my_function, returnType)

my_udf = accumulator_udf(accumulator=accum, returnType=IntegerType())
Result
df.select(my_udf(df.number)).show()
+---------------+
|div_one(number)|
+---------------+
| null|
| 2|
+---------------+
print(accum.value)
> [[{'errorType': 'integer division or modulo by zero', 'Data': 7}], []]
What I've tried
After a few readings, I found this post (https://www.thecodeship.com/patterns/guide-to-python-function-decorators/) which helped, but I'm stuck with a NameError.
def accumulator_udf(accumulator, returnType):
    def func_wrapper(func):
        return F.udf(func, returnType)
    return func_wrapper

accum = spark.sparkContext.accumulator([], ListParam())

@accumulator_udf(accumulator=accum, returnType=IntegerType())
def my_function(x):
    y = None
    try:
        y = (x / (x - 7))
    except Exception as e:
        accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
    return y
df.select(my_function(df.number)).show()
When trying this implementation, I have the following error :
NameError: global name 'accumulator' is not defined
How can I manage to access 'accumulator'?
Thanks!
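For reference, a minimal sketch (my assumption of what the linked decorator guide suggests, not a confirmed answer) in which the inner function closes over the decorator's accumulator argument instead of looking it up as a global:
# Sketch only: `wrapped` closes over `accumulator` from the decorator factory,
# so no global name lookup happens inside the wrapped function.
def accumulator_udf(accumulator, returnType):
    def func_wrapper(func):
        def wrapped(x):
            try:
                return func(x)
            except Exception as e:
                accumulator.add(dict([('errorType', str(e)), ('Data', x)]))
                return None
        return F.udf(wrapped, returnType)
    return func_wrapper

@accumulator_udf(accumulator=accum, returnType=IntegerType())
def my_function(x):
    return x / (x - 7)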