PySpark: compare column values with another column containing a range of values - Python

I want to compare the values of one column with another column that holds a range of reference values.
I have tried the following code:
from pyspark.sql.functions import udf, size
from pyspark.sql.types import *

df1 = sc.parallelize([([1], [1, 2, 3]), ([2], [4, 5, 6, 7])]).toDF(["value", "Reference_value"])

intersect = lambda type: (udf(
    lambda x, y: (
        list(set(x) & set(y)) if x is not None and y is not None else None),
    ArrayType(type)))
integer_intersect = intersect(IntegerType())

# df1.select(
#     integer_intersect("value", "Reference_value"),
#     size(integer_intersect("value", "Reference_value"))).show()
df1 = df1.where(size(integer_intersect("value", "Reference_value")) > 0)
df1.show()
The above code works if we create the DataFrame as above, because the value and Reference_value columns are ArrayType of LongType.
However, when I read the DataFrame from a CSV file I am not able to convert the columns to array type. Here df1 is read from CSV and looks as follows:
df1 =
category  value  Reference_value
count     1      1
n_timer   n20    n40,n20
frames    54     56
timer     n8     n3,n6,n7
pdf       FALSE  TRUE
zip       FALSE  FALSE
I want to compare the "value" column with the "Reference_value" column and derive two new DataFrames: one that keeps the rows where the value is found in the Reference_value set (df2), and one that keeps the rows where it is not (df3).
Output df2 =
category  value  Reference_value
count     1      1
n_timer   n20    n40,n20
zip       FALSE  FALSE
Output df3 =
category  value  Reference_value
frames    54     56
timer     n8     n3,n6,n7
pdf       FALSE  TRUE
Is there any easier way, like array_contains? I tried array_contains as well, but it is not working:
from pyspark.sql.functions import array_contains
df.where(array_contains("Reference_value", df1["value"]))

# One can copy-paste the code below to reproduce the inputs and outputs directly
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, size, split
from pyspark.sql.types import *

sc = SparkContext.getOrCreate()
sqlContext = SQLContext.getOrCreate(sc)

df1 = sc.parallelize([("count", "1", "1"), ("n_timer", "n20", "n40,n20"), ("frames", "54", "56"), ("timer", "n8", "n3,n6,n7"), ("pdf", "FALSE", "TRUE"), ("zip", "FALSE", "FALSE")]).toDF(["category", "value", "Reference_value"])
df1.show()

df1 = df1.withColumn("Reference_value", split("Reference_value", r",\s*").cast("array<string>"))
df1 = df1.withColumn("value", split("value", r",\s*").cast("array<string>"))

intersect = lambda type: (udf(
    lambda x, y: (
        list(set(x) & set(y)) if x is not None and y is not None else None),
    ArrayType(type)))
string_intersect = intersect(StringType())

df2 = df1.where(size(string_intersect("value", "Reference_value")) > 0)
df3 = df1.where(size(string_intersect("value", "Reference_value")) <= 0)
df2.show()
df3.show()
input df1=
+--------+-----+---------------+
|category|value|Reference_value|
+--------+-----+---------------+
| count| 1| 1|
| n_timer| n20| n40,n20|
| frames| 54| 56|
| timer| n8| n3,n6,n7|
| pdf|FALSE| TRUE|
| zip|FALSE| FALSE|
+--------+-----+---------------+
df2=
+--------+-------+---------------+
|category| value|Reference_value|
+--------+-------+---------------+
| count| [1]| [1]|
| n_timer| [n20]| [n40, n20]|
| zip|[FALSE]| [FALSE]|
+--------+-------+---------------+
df3=
+--------+-------+---------------+
|category| value|Reference_value|
+--------+-------+---------------+
| frames| [54]| [56]|
| timer| [n8]| [n3, n6, n7]|
| pdf|[FALSE]| [TRUE]|
+--------+-------+---------------+
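If you're on Spark 2.4 or later, a possible alternative without a Python UDF (just a sketch, assuming the same df1 with the split array columns as above) is to use the built-in arrays_overlap and array_intersect functions:
from pyspark.sql.functions import arrays_overlap, array_intersect

# rows where at least one element of "value" appears in "Reference_value"
df2 = df1.where(arrays_overlap("value", "Reference_value"))
# rows with no common element
df3 = df1.where(~arrays_overlap("value", "Reference_value"))

# array_intersect returns the common elements themselves, if you need them
df1.select("category", array_intersect("value", "Reference_value").alias("common")).show()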

Related

Counting nulls or zeros in PySpark data frame with struct column types

I have a PySpark data frame that has a mix of integer columns, string columns, and also struct columns. A struct column can hold a struct, but it can also just be null. For example:
id | mystring  | mystruct |
---------------------------
 1 | something | <struct> |
 2 | something | null     |
 3 | 0         | null     |
 4 | something | null     |
 5 | something | <struct> |
Is there any easy way to go through the entire data frame and get the count of null/na/0 values without having to explode the struct columns? For example, for the table above I would want:
id | mystring | mystruct |
--------------------------
 0 |        1 |        3 |
I've seen a few different methods but they always seem to throw an error with the struct types, and I'd rather not have to do them separately.
Not exactly an easy way, but you could define a function to handle the nulls (all input types) and nans/zeros (for numeric inputs) for each column. Then you can join the results for each column separately.
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id

conf = SparkConf()
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# setup
data = [[1, {'f': [1, 2, 3]}], [2, None], [0, None], [1, None], [3, {'f': [1]}]]
schema = StructType([
    StructField('mynum', IntegerType(), True),
    StructField('mystruct',
                StructType([StructField('f', ArrayType(IntegerType()), True)]), True)
])
rdd = spark.sparkContext.parallelize(data)
df = spark.createDataFrame(rdd, schema)

def get_nulls_nans_zeros(c, df):
    # all inputs
    val = df.select(count(when(isnull(c), c)).alias(c))
    t = type(df.schema[c].dataType)
    # numeric inputs
    if t in [ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, DecimalType]:
        val = val.union(df.select(count(when(isnan(c), c)).alias(c)))
        val = val.union(df.select(count(when(col(c) == 0, c)).alias(c)))
    return val.select(sum(c).alias(c))

# Get and merge results for each column
res = [get_nulls_nans_zeros(c, df) for c in df.columns]
res = [r.withColumn("id", monotonically_increasing_id()) for r in res]
result = res[0].join(res[1], "id", "outer").drop("id")
result.show()
If you're using Spark 3.1+, you can also use the allowMissingColumns flag in unionByName to do the last part instead of having to join via a monotonically increasing id.
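For illustration, a sketch of that Spark 3.1+ variant (assuming the same res list of single-column count DataFrames built above):
from functools import reduce

# unionByName with allowMissingColumns=True pads the missing columns with
# nulls, so the per-column counts can be stacked without a join key
merged = reduce(lambda a, b: a.unionByName(b, allowMissingColumns=True), res)
# sum ignores nulls, so this collapses the stack to one row of counts
result = merged.select([sum(c).alias(c) for c in df.columns])
result.show()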

How to obtain the most repeated value in an Array Type column in PySpark?

I have a PySpark data frame like the following:
columns = ["id", "values"]
data = [("sample1", ["a", "b", "a"]), ("sample2", ["b", "b", "a", "c"])]
dataframe = spark.createDataFrame(data, columns)
source
+-------+--------------------+
| id| values|
+-------+--------------------+
|sample1| ["a","b","a"]|
|sample2| ["b","b","a","c"]|
+-------+--------------------+
I would like to build a column with the most common value in the array and obtain a dataframe like the following:
+-------+--------------------+---------+
| id| values| common|
+-------+--------------------+---------+
|sample1| ["a","b","a"]| "a"|
|sample2| ["b","b","a","c"]| "b"|
+-------+--------------------+---------+
You can explode the array values, then group by to count the occurrences of each value, and use a Window to keep the value with the max count:
from pyspark.sql import Window
import pyspark.sql.functions as F

df1 = df.withColumn(
    "common",
    F.explode("values")
).groupBy("id", "values", "common").count().withColumn(
    "rn",
    F.row_number().over(Window.partitionBy("id", "values").orderBy(F.col("count").desc()))
).filter("rn = 1").drop("rn", "count")

df1.show()
#+-------+------------+------+
#|id |values |common|
#+-------+------------+------+
#|sample1|[a, b, a] |a |
#|sample2|[b, b, a, c]|b |
#+-------+------------+------+
Another way without using explode is to do it with higher-order functions transform and filter along with some array functions:
df1 = df.withColumn(
    "common",
    F.array_max(
        F.expr("""transform(
            array_distinct(values),
            x -> struct(
                size(filter(values, y -> y = x)) as count,
                x as value
            )
        )""")
    )["value"]
)
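This works because array_max compares the (count, value) structs field by field, so the struct with the largest count wins. An equivalent sketch of the same idea (my own variant, not part of the original answer) sorts the structs in descending order and takes the first one:
df1 = df.withColumn(
    "common",
    F.element_at(
        F.sort_array(
            F.expr("""transform(
                array_distinct(values),
                x -> struct(
                    size(filter(values, y -> y = x)) as count,
                    x as value
                )
            )"""),
            asc=False
        ),
        1  # element_at is 1-based: take the struct with the highest count
    )["value"]
)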

Add column with closest values to PySpark DataFrame

I have a PySpark dataframe (say df) which has two columns (Name and Score). Following is an example of the dataframe:
+------+-----+
| Name|Score|
+------+-----+
| name1|11.23|
| name2|14.57|
| name3| 2.21|
| name4| 8.76|
| name5|18.71|
+------+-----+
I have a numpy array (say bin_array) which has values close to the numerical values that are there in the column titled Score of the PySpark dataframe.
Following is the aforementioned numpy array:
bin_array = np.array([0, 5, 10, 15, 20])
I want to compare value from each row of the column Score with values in bin_array and store the closest value (gotten from bin_array) in a separate column in the PySpark dataframe.
Below is how I would like my new dataframe (say df_new) to look.
+------+-----+------------+
| Name|Score| Closest_bin|
+------+-----+------------+
| name1|11.23| 10.0 |
| name2|14.57| 15.0 |
| name3| 2.21| 0.0 |
| name4| 8.76| 10.0 |
| name5|18.71| 20.0 |
+------+-----+------------+
I have the below mentioned function which gives me the closest values from bin_array. The function works fine when I test it with individual numbers.
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return float(array[idx])
In my actual work, I will have millions of rows in the dataframe. What is the most efficient way to create df_new?
Following are the steps that I tried to use to create user-defined function (udf) and the new data frame (df_new).
closest_bin_udf = F.udf( lambda x: find_nearest(array, x) )
df_new = df.withColumn( 'Closest_bin' , closest_bin_udf(df.Score) )
But, I got errors when I tried df_new.show(). A portion of the error is shown below.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-685c9b7e25d9> in <module>()
----> 1 df_new.show()
/usr/lib/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
376 """
377 if isinstance(truncate, bool) and truncate:
--> 378 print(self._jdf.showString(n, 20, vertical))
379 else:
380 print(self._jdf.showString(n, int(truncate), vertical))
You can use the below mentioned steps to create the aforementioned dataframe:
from pyspark.sql import *
import pyspark.sql.functions as F
import numpy as np
Stats = Row("Name", "Score")
stat1 = Stats('name1', 11.23)
stat2 = Stats('name2', 14.57)
stat3 = Stats('name3', 2.21)
stat4 = Stats('name4', 8.76)
stat5 = Stats('name5', 18.71)
stat_lst = [stat1 , stat2, stat3, stat4, stat5]
df = spark.createDataFrame(stat_lst)
df.show()
You can use a Bucketizer from pyspark.ml.feature:
from pyspark.sql import *
import pyspark.sql.functions as F
import numpy as np
Stats = Row("Name", "Score")
stat_lst = [Stats('name1', 11.23) , Stats('name2', 14.57), Stats('name3', 2.21), Stats('name4', 8.76), Stats('name5', 18.71)]
df = spark.createDataFrame(stat_lst)
from pyspark.ml.feature import Bucketizer
"""
Bucketizer creates bins like 0-5:0, 5-10:1, 10-15:2, 15-20:3
As I see, your expected output wants the closest numbered bin, so you might
have to change your buckets or the variable `t` below accordingly.
"""
bucket_list = [0, 5, 10, 15, 20]
bucketizer = Bucketizer(splits=bucket_list, inputCol="Score", outputCol="buckets")
df_buck = bucketizer.setHandleInvalid("keep").transform(df)
df_buck.show()
I am still working on getting the closest bin, I'll update my answer.
If you want your bin_array values for each bucket, you can use a udf to create a new column with the bucket names:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
t = dict(zip(range(len(bucket_list)), bucket_list))
udf_foo = udf(lambda x: t[x], IntegerType())
df_buck = df_buck.withColumn("score_bucket", udf_foo("buckets"))
Output
>>> df_buck.show()
+-----+-----+-------+------------+
| Name|Score|buckets|score_bucket|
+-----+-----+-------+------------+
|name1|11.23| 2.0| 10|
|name2|14.57| 2.0| 10|
|name3| 2.21| 0.0| 0|
|name4| 8.76| 1.0| 5|
|name5|18.71| 3.0| 15|
+-----+-----+-------+------------+
EDIT: Correcting the score buckets:
# Not dynamic, but please try to figure out this business logic according to your use-case
df_buck = df_buck.withColumn("correct_buckets", F.when(df_buck.Score-df_buck.score_bucket > 5/2, F.col("score_bucket") + 5).otherwise(F.col("score_bucket"))).drop("buckets", "score_bucket")
Now output is as expected:
+-----+-----+---------------+
| Name|Score|correct_buckets|
+-----+-----+---------------+
|name1|11.23| 10|
|name2|14.57| 15|
|name3| 2.21| 0|
|name4| 8.76| 10|
|name5|18.71| 20|
+-----+-----+---------------+
You can also use a pandas_udf, although I'd suggest you test the speed and memory consumption as you scale up:
from pyspark.sql.functions import pandas_udf, PandasUDFType
import numpy as np
import pandas as pd

df = spark.createDataFrame(zip(["name_" + str(i) for i in range(1, 6)], [11.23, 14.57, 2.21, 8.76, 18.71]), ["Name", "Score"])
bin_array = np.array([0, 5, 10, 15, 20])

@pandas_udf('double', PandasUDFType.SCALAR)
def find_nearest(value):
    res = bin_array[np.newaxis, :] - value.values[:, np.newaxis]
    ret_vals = [bin_array[np.argmin(np.abs(i))] for i in res]
    return pd.Series(ret_vals)

df.withColumn('v2', find_nearest(df.Score)).show()
Output
+------+-----+----+
| Name|Score| v2|
+------+-----+----+
|name_1|11.23|10.0|
|name_2|14.57|15.0|
|name_3| 2.21| 0.0|
|name_4| 8.76|10.0|
|name_5|18.71|20.0|
+------+-----+----+
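If you would rather avoid UDFs entirely, here is a sketch using only built-in functions (Spark 2.4+; the bins and column names are assumed to be the ones from the question). It builds an array of (distance, bin) structs per row and takes the minimum:
from pyspark.sql import functions as F

bins = [0.0, 5.0, 10.0, 15.0, 20.0]  # same values as bin_array

# for each row, build structs of (|Score - bin|, bin); array_min picks the
# struct with the smallest distance, and we keep its bin field
dist_structs = F.array(*[
    F.struct(F.abs(F.col("Score") - F.lit(b)).alias("dist"), F.lit(b).alias("bin"))
    for b in bins
])
df_new = df.withColumn("Closest_bin", F.array_min(dist_structs)["bin"])
df_new.show()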

How do I convert a unicode list contained in a pyspark column of a dataframe into a float list?

I have created a dataframe as shown below:
import ast
from pyspark.sql.functions import udf

values = [(u"['2','4','713']", 10), (u"['12','245']", 20), (u"['101','12']", 30)]
df = sqlContext.createDataFrame(values, ['list', 'A'])
df.show()
+---------------+---+
|           list|  A|
+---------------+---+
|['2','4','713']| 10|
|   ['12','245']| 20|
|   ['101','12']| 30|
+---------------+---+
How can I convert the above dataframe so that each element in the list is a float and is inside a proper list?
I tried the following:
def df_amp_conversion(df_modelamp):
    string_list_to_list = udf(lambda row: ast.literal_eval(str(row)))
    df_modelamp = df_modelamp.withColumn('float_list', string_list_to_list(col("list")))

df2 = df_amp_conversion(df)
But the data remains the same, without any change.
I don't want to convert the dataframe to pandas or use collect, as it is memory-intensive. If possible, please suggest an optimal solution. I am using PySpark.
That's because you forgot to specify the return type of the udf:
udf(lambda row: ast.literal_eval(str(row)), "array<integer>")
Though something like this would be more efficient:
from pyspark.sql.functions import regexp_replace, split

df = spark.createDataFrame(["""u'[23,4,77,890,4]"""], "string").toDF("list")

df.select(split(
    regexp_replace("list", "^u'\\[|\\]$", ""), ","
).cast("array<integer>").alias("list")).show()
# +-------------------+
# |               list|
# +-------------------+
# |[23, 4, 77, 890, 4]|
# +-------------------+
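Since the question title asks for a float list, a minimal variation of the same approach (my own tweak, not part of the original answer) strips the brackets and the quotes around each element and casts to array<float>:
from pyspark.sql.functions import regexp_replace, split

# df here is the question's original dataframe with columns 'list' and 'A'
df_float = df.select(
    "A",
    split(
        # drop the brackets and the single quotes around each element
        regexp_replace("list", "[\\[\\]']", ""), ","
    ).cast("array<float>").alias("float_list")
)
df_float.show()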
I can produce the correct result in Python 3 with a small change in the definition of df_amp_conversion: you didn't return the value of df_modelamp! This code works properly for me:
import ast
from pyspark.sql.functions import udf, col

values = [(u"['2','4','713']", 10), (u"['12','245']", 20), (u"['101','12']", 30)]
df = sqlContext.createDataFrame(values, ['list', 'A'])

def df_amp_conversion(df_modelamp):
    string_list_to_list = udf(lambda row: ast.literal_eval(str(row)))
    df_modelamp = df_modelamp.withColumn('float_list', string_list_to_list(col("list")))
    return df_modelamp

df2 = df_amp_conversion(df)
df2.show()
# +---------------+---+-----------+
# |           list|  A| float_list|
# +---------------+---+-----------+
# |['2','4','713']| 10|[2, 4, 713]|
# |   ['12','245']| 20|  [12, 245]|
# |   ['101','12']| 30|  [101, 12]|
# +---------------+---+-----------+

map values in a dataframe from a dictionary using pyspark

I want to know how to map values in a specific column in a dataframe.
I have a dataframe which looks like:
df = sc.parallelize([('india','japan'),('usa','uruguay')]).toDF(['col1','col2'])
+-----+-------+
| col1| col2|
+-----+-------+
|india| japan|
| usa|uruguay|
+-----+-------+
I have a dictionary from where I want to map the values.
dicts = sc.parallelize([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')])
The output I want is:
+-----+-------+--------+--------+
| col1| col2|col1_map|col2_map|
+-----+-------+--------+--------+
|india| japan| ind| jpn|
| usa|uruguay| us| urg|
+-----+-------+--------+--------+
I have tried using the lookup function, but it doesn't work: it throws a SPARK-5063 error. The following is my approach, which failed:
def map_val(x):
    return dicts.lookup(x)[0]

myfun = udf(lambda x: map_val(x), StringType())

df = df.withColumn('col1_map', myfun('col1'))  # doesn't work
df = df.withColumn('col2_map', myfun('col2'))  # doesn't work
I think the easier way is just to use a plain Python dictionary with create_map and df.withColumn:
from itertools import chain
from pyspark.sql.functions import create_map, lit
simple_dict = {'india':'ind', 'usa':'us', 'japan':'jpn', 'uruguay':'urg'}
mapping_expr = create_map([lit(x) for x in chain(*simple_dict.items())])
df = df.withColumn('col1_map', mapping_expr[df['col1']])\
.withColumn('col2_map', mapping_expr[df['col2']])
df.show(truncate=False)
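One thing to be aware of with this approach: the create_map lookup returns null for a key that is not in simple_dict. A small variant (my addition, not part of the original answer) that falls back to the original value in that case:
from pyspark.sql.functions import coalesce

# keep the original value when it has no entry in simple_dict
df = df.withColumn('col1_map', coalesce(mapping_expr[df['col1']], df['col1']))\
    .withColumn('col2_map', coalesce(mapping_expr[df['col2']], df['col2']))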
udf way
I would suggest you change the list of tuples to a dict and broadcast it so it can be used inside the udf:
dicts = sc.broadcast(dict([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')]))
from pyspark.sql import functions as f
from pyspark.sql import types as t
def newCols(x):
    return dicts.value[x]
callnewColsUdf = f.udf(newCols, t.StringType())
df.withColumn('col1_map', callnewColsUdf(f.col('col1')))\
.withColumn('col2_map', callnewColsUdf(f.col('col2')))\
.show(truncate=False)
which should give you
+-----+-------+--------+--------+
|col1 |col2 |col1_map|col2_map|
+-----+-------+--------+--------+
|india|japan |ind |jpn |
|usa |uruguay|us |urg |
+-----+-------+--------+--------+
join way (slower than udf way)
All you have to do is convert the dicts rdd to a dataframe too and use two joins with aliases, as follows:
df = sc.parallelize([('india','japan'),('usa','uruguay')]).toDF(['col1','col2'])
dicts = sc.parallelize([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')]).toDF(['key', 'value'])
from pyspark.sql import functions as f
df.join(dicts, df['col1'] == dicts['key'], 'inner')\
.select(f.col('col1'), f.col('col2'), f.col('value').alias('col1_map'))\
.join(dicts, df['col2'] == dicts['key'], 'inner') \
.select(f.col('col1'), f.col('col2'), f.col('col1_map'), f.col('value').alias('col2_map'))\
.show(truncate=False)
which should give you the same result
Similar to Ali AzG, but pulling it all out into a handy little method if anyone finds it useful
from itertools import chain
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from typing import Dict
def map_column_values(df: DataFrame, map_dict: Dict, column: str, new_column: str = "") -> DataFrame:
    """Handy method for mapping column values from one value to another

    Args:
        df (DataFrame): Dataframe to operate on
        map_dict (Dict): Dictionary containing the values to map from and to
        column (str): The column containing the values to be mapped
        new_column (str, optional): The name of the column to store the mapped values in.
            If not specified the values will be stored in the original column

    Returns:
        DataFrame
    """
    spark_map = F.create_map([F.lit(x) for x in chain(*map_dict.items())])
    return df.withColumn(new_column or column, spark_map[df[column]])
This can be used as follows
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.master("local[3]").getOrCreate()
df = spark.createDataFrame([Row(A=0), Row(A=1)])
df = map_column_values(df, map_dict={0:"foo", 1:"bar"}, column="A", new_column="B")
df.show()
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#+---+---+
#| A| B|
#+---+---+
#| 0|foo|
#| 1|bar|
#+---+---+
