PySpark testing: construct test data consisting of an array of structs - python

I'd like to generate some test data for my unit tests in PySpark. One of the fields in the input Row is an array of structs: basket: array<struct<price:bigint,product_id:string>>. What's the best way to achieve this?

Here is one way to do it, using Python and two helper functions responsible for generating the random data:
from pyspark.sql.types import *
from random import randrange, uniform

array_size = 2

def create_row(array_size):
    return ([{"price": uniform(1.0, 100.0), "product_id": randrange(10) + 1} for _ in range(array_size)],)

def generate_data(data_size):
    return [create_row(array_size) for _ in range(data_size)]

# create 5 rows
rows = generate_data(5)

# string schema
schema = "basket: array<struct<price:double,product_id:string>>"

# static typing schema
# schema = StructType([
#     StructField('basket',
#         ArrayType(
#             StructType([
#                 StructField('price', DoubleType()),
#                 StructField('product_id', StringType()),
#             ])
#         )
#     )
# ])

df = spark.createDataFrame(rows, schema)
df.show(10, False)
# +--------------------------------------------------+
# |basket |
# +--------------------------------------------------+
# |[[61.40674765573896, 9], [5.994467505720648, 7]] |
# |[[1.1388272509974906, 10], [47.32070824053193, 3]]|
# |[[42.423106687845795, 2], [70.99107361888588, 4]] |
# |[[50.019594333009806, 8], [63.51239439900147, 4]] |
# |[[68.15711374321089, 9], [70.06617125228864, 10]] |
# +--------------------------------------------------+
create_row: generates a new row (represented as a tuple here) with array_size items. price will have a value in the range 1.0 - 100.0 and product_id in the range 1 - 10; feel free to modify the boundaries accordingly. Each item of the array (a product_id-price pair) is represented with a Python dictionary.
generate_data: calls create_row data_size times and returns the randomly generated rows as a list.
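If you want to drop this generator into an actual unit test, a minimal sketch could look like the following (this assumes pytest and a locally built SparkSession; the fixture and test names are illustrative additions, not part of the answer above):

import pytest
from pyspark.sql import SparkSession

@pytest.fixture(scope="session")
def spark():
    # Local SparkSession for the test suite; adjust master/appName to taste
    return SparkSession.builder.master("local[2]").appName("tests").getOrCreate()

def test_basket_generation(spark):
    rows = generate_data(5)
    df = spark.createDataFrame(rows, "basket: array<struct<price:double,product_id:string>>")
    # Every generated row should contain exactly array_size basket entries
    assert df.count() == 5
    assert all(len(r["basket"]) == array_size for r in df.collect())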

Related

How to coalesce multiple pyspark arrays?

I have an arbitrary number of arrays of equal length in a PySpark DataFrame. I need to coalesce these, element by element, into a single list. The problem with coalesce is that it doesn't work by element, but rather selects the entire first non-null array. Any suggestions for how to accomplish this would be appreciated. Please see the test case below for an example of expected input and output:
def test_coalesce_elements():
    """
    Test array coalescing on a per-element basis
    """
    from pyspark.sql import SparkSession
    import pyspark.sql.types as t
    import pyspark.sql.functions as f

    spark = SparkSession.builder.getOrCreate()

    data = [
        {
            "a": [None, 1, None, None],
            "b": [2, 3, None, None],
            "c": [5, 6, 7, None],
        }
    ]

    schema = t.StructType([
        t.StructField('a', t.ArrayType(t.IntegerType())),
        t.StructField('b', t.ArrayType(t.IntegerType())),
        t.StructField('c', t.ArrayType(t.IntegerType())),
    ])
    df = spark.createDataFrame(data, schema)

    # Inspect schema
    df.printSchema()
    # root
    #  |-- a: array (nullable = true)
    #  |    |-- element: integer (containsNull = true)
    #  |-- b: array (nullable = true)
    #  |    |-- element: integer (containsNull = true)
    #  |-- c: array (nullable = true)
    #  |    |-- element: integer (containsNull = true)

    # Inspect df values
    df.show(truncate=False)
    # +---------------------+------------------+---------------+
    # |a                    |b                 |c              |
    # +---------------------+------------------+---------------+
    # |[null, 1, null, null]|[2, 3, null, null]|[5, 6, 7, null]|
    # +---------------------+------------------+---------------+

    # This obviously does not work, but hopefully provides the general idea
    # Remember: this will need to work with an arbitrary and dynamic set of columns
    input_cols = ['a', 'b', 'c']
    df = df.withColumn('d', f.coalesce(*[f.col(i) for i in input_cols]))

    # This is the expected output I would like to see for the given inputs
    assert df.collect()[0]['d'] == [2, 1, 7, None]
Thanks in advance for any ideas!
Well, as Derek and the OP have said, Derek's answer works, but it would be better to avoid using UDFs, so here is a way to accomplish it natively:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Give it any static value as we just want a row number for all the rows present in the DataFrame
w = Window().orderBy(F.lit('A'))

# Will be used later to join df with the second df containing the calculated "d" column
df = df.withColumn("row_num", F.row_number().over(w))

print("DF:")
df.show(truncate=False)

# Input columns
input_cols = ['a', 'b', 'c']

# Zip all the arrays using arrays_zip
# Explode the zipped array
# Create new columns from the exploded zipped array to get single values
# Coalesce to get the first non-null value
# Group by row_num as we want to bring all the values back into one array
# First convert to array before using collect_list, as it ignores "null" values, then flatten the nested array to get one single flat array
df_2 = df.withColumn("new", F.arrays_zip(*input_cols)) \
    .withColumn("new", F.explode("new")) \
    .select("row_num", *[F.col(f"new.{i}").alias(f"new_{i}") for i in input_cols]) \
    .withColumn("d", F.coalesce(*[(F.col(f"new_{i}")) for i in input_cols])) \
    .groupBy("row_num") \
    .agg(F.flatten(F.collect_list(F.array("d"))).alias("d"))

print("Second DF:")
df_2.show(truncate=False)

# Join based on the row_num
final_df = df.join(df_2, df["row_num"] == df_2["row_num"], "inner") \
    .drop("row_num")

# voilà
print("Final DF:")
final_df.show(truncate=False)

assert final_df.collect()[0]["d"] == [2, 1, 7, None]
DF:
+---------------------+------------------+---------------+-------+
|a |b |c |row_num|
+---------------------+------------------+---------------+-------+
|[null, 1, null, null]|[2, 3, null, null]|[5, 6, 7, null]|1 |
+---------------------+------------------+---------------+-------+
Second DF:
+-------+---------------+
|row_num|d |
+-------+---------------+
|1 |[2, 1, 7, null]|
+-------+---------------+
Final DF:
+---------------------+------------------+---------------+---------------+
|a |b |c |d |
+---------------------+------------------+---------------+---------------+
|[null, 1, null, null]|[2, 3, null, null]|[5, 6, 7, null]|[2, 1, 7, null]|
+---------------------+------------------+---------------+---------------+
Although it would be ideal, I am not sure if there is an elegant way to do this using only pyspark functions.
What I did is write a udf that takes in a variable number of columns (using *args, which you can read about here) and returns an array of integers.
import pyspark.sql.functions as f
import pyspark.sql.types as t

@f.udf(returnType=t.ArrayType(t.IntegerType()))
def get_array_non_null_first_element(*args):
    data_array = [item for item in args]
    array_lengths = [len(array) for array in data_array]
    ## check that all of the arrays have the same length
    assert len(set(array_lengths)) == 1
    ## if they do, then you can set the array length
    array_length = array_lengths[0]
    first_value_array = []
    for i in range(array_length):
        element_array = [array[i] for array in data_array]
        value = None
        for x in element_array:
            if x is not None:
                value = x
                break
        first_value_array.append(value)
    return first_value_array
Then create a new column d by applying this udf to whichever columns you like:
df.withColumn("d", get_array_non_null_first_element(F.col('a'), F.col('b'), F.col('c'))).show()
+--------------------+------------------+---------------+---------------+
| a| b| c| d|
+--------------------+------------------+---------------+---------------+
|[null, 1, null, n...|[2, 3, null, null]|[5, 6, 7, null]|[2, 1, 7, null]|
+--------------------+------------------+---------------+---------------+
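Since the question mentions an arbitrary and dynamic set of columns, note that the same udf can be applied to a list of column names by unpacking it. A small sketch of that call (my addition, reusing input_cols from the question):

import pyspark.sql.functions as F

input_cols = ['a', 'b', 'c']

# Unpack a dynamic list of columns into the udf call
df = df.withColumn("d", get_array_non_null_first_element(*[F.col(c) for c in input_cols]))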
Thanks to Derek and Tushar for their responses! I was able to use them as a basis to solve the issue without a UDF, join, or explode.
Generally speaking, joins are unfavorable because they are computationally and memory expensive, UDFs can be computationally intensive, and explode can be memory intensive. Fortunately, we can avoid all of these using transform:
from typing import List, Union

import pyspark.sql.functions as f
from pyspark.sql import Column, DataFrame

def add_coalesced_list_by_elements_col(
    df: DataFrame,
    cols: List[Union[Column, str]],
    col_name: str,
) -> DataFrame:
    """
    Adds a new column representing a list that is collected by element from the
    input set. Please note that this does not check that all provided columns
    are of equal length.

    Args:
        df: Input DataFrame to add column to
        cols: List of columns to collect by element. All columns should be of equal length.
        col_name: The name of the new column

    Returns:
        DataFrame with result added as a new column.
    """
    return (
        df
        .withColumn(
            col_name,
            f.transform(
                # Doesn't matter which col, we don't use this val
                cols[0],
                # We use i + 1 because sql array index starts at 1, while transform index starts at 0
                lambda _, i: f.coalesce(*[f.element_at(c, i + 1) for c in cols]))
        )
    )
def test_collect_list_elements():
    from typing import List
    import pyspark.sql.functions as f
    import pyspark.sql.types as t
    from pyspark.sql import SparkSession, DataFrame, Column, Window

    # Arrange
    spark = SparkSession.builder.getOrCreate()
    data = [
        {
            "id": 1,
            "a": [None, 1, None, None],
            "b": [2, 3, None, None],
            "c": [5, 6, 7, None],
        }
    ]
    schema = t.StructType(
        [
            t.StructField("id", t.IntegerType()),
            t.StructField("a", t.ArrayType(t.IntegerType())),
            t.StructField("b", t.ArrayType(t.IntegerType())),
            t.StructField("c", t.ArrayType(t.IntegerType())),
        ]
    )
    df = spark.createDataFrame(data, schema)

    # Act
    df = add_coalesced_list_by_elements_col(df=df, cols=["a", "b", "c"], col_name="d")

    # Assert new col is correct output
    assert df.collect()[0]["d"] == [2, 1, 7, None]

    # Assert all the other cols are not affected
    assert df.collect()[0]["a"] == [None, 1, None, None]
    assert df.collect()[0]["b"] == [2, 3, None, None]
    assert df.collect()[0]["c"] == [5, 6, 7, None]
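If you are on a Spark version where pyspark.sql.functions.transform does not yet accept a Python lambda (the Python wrapper appeared in Spark 3.1), roughly the same logic can be expressed through the SQL higher-order function via expr. This sketch is my own assumption, using the column names from the test above, and is not part of the original answer:

import pyspark.sql.functions as f

# transform indexes from 0 while element_at indexes from 1, hence the i + 1
df = df.withColumn(
    "d",
    f.expr(
        "transform(a, (x, i) -> coalesce(element_at(a, i + 1), "
        "element_at(b, i + 1), element_at(c, i + 1)))"
    ),
)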

How to efficiently label rows randomly in each group of a big dataframe?

I have a dataframe df containing 40 million rows. There is a column named group_id that specifies the group identifier of a row. There are 2000 groups in total.
I would like to randomly label the elements in each group and add this information to a column batch of df. For example, if group 1 contains rows 1, 2, 3, 4, and 5, then I choose a permutation of (1, 2, 3, 4, 5), say (5, 3, 4, 2, 1), and assign the values [5, 3, 4, 2, 1] to the column batch for these rows.
I defined a function func and used multiprocessing.dummy.Pool for parallelization, but it is very slow. Could you suggest a faster way to do this?
import pandas as pd
import numpy as np
import random
import os
from multiprocessing import dummy
import itertools

core = os.cpu_count()
P = dummy.Pool(processes=core)

N = int(4e7)
M = int(2e3) + 1
col_1 = np.random.randint(1, M, N)
col_2 = np.random.uniform(low=1, high=5, size=N)
df = pd.DataFrame({'group_id': col_1, 'value': col_2})
df.sort_values(by='group_id', inplace=True)
df.reset_index(inplace=True, drop=True)
id_ = np.unique(df.group_id)

def func(i):
    idx = df.group_id == i
    m = sum(idx)                  # count the number of rows in each group
    r = list(range(1, m + 1, 1))  # create an enumeration
    random.shuffle(r)             # create a permutation of the enumeration
    return r

order_list = P.map(func, id_)
# merge the lists containing the permutations
order = list(itertools.chain.from_iterable(order_list))
df['batch'] = order
Perhaps this could solve your problem: take a random permutation of the group size.
import numpy as np
import pandas as pd
l = np.repeat([x for x in range(2000)],20000)
df = pd.DataFrame(l, columns=['group'])
df['batch'] = df.groupby('group')['group'].transform(lambda x: np.random.permutation(np.arange(x.size)))
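As a quick sanity check (my addition, not part of the answer), you can verify that within each group the batch column is indeed a permutation of 0 .. group_size - 1:

# Each group's batch values should be exactly 0 .. group_size - 1, in some order
check = df.groupby('group')['batch'].apply(
    lambda s: sorted(s.tolist()) == list(range(len(s)))
)
assert check.all()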

iterate over a list, apply a function and set each output to rows of a pandas dataframe

So I have the following data, which comes from two different pandas dataframes:
lis = []
for index, rows in full.iterrows():
    my_list = [rows.ARIEL, rows.LBHD, rows.LFHD, rows.RFHD, rows.RBHD]
    lis.append(my_list)

lis2 = []
for index, rows in reduced.iterrows():
    my_list = rows.bar_head
    lis2.append(my_list)
For example, parts of lis and lis2 are shown below:
lis = [[[-205.981, 1638.787, 1145.274], [-264.941, 1482.371, 1168.693], [-263.454, 1579.4370000000001, 1016.279], [-148.062, 1592.005, 1016.75], [-134.313, 1479.1429999999998, 1167.109]], ...
lis2 = [[-203.3502, 1554.3486, 1102.821], [-203.428, 1554.3492, 1103.0592], [-203.4954, 1554.3234, 1103.2794], [-203.5022, 1554.2974, 1103.4522], ...
What I want is to use lis and lis2 with the following apply method (where mdf is another empty dataframe of the same length as the other two, and md is a function I've created):
mdf['head_md'] = mdf['head_md'].apply(md, args=(5, lis, lis2))
But the way it works now, it outputs the same result to all rows of mdf.
What I want is for it to loop through lis and lis2 and, based on the indexes, output the corresponding result to the corresponding row of mdf. All dataframes and variables have length 7446.
I tried for example this, but it doesn't work:
for i in range(len(mdf)):
    for j in range(0, 5):
        mdf['head_md'] = mdf['head_md'].apply(md, args=(5, lis[i][j], lis2[i]))
Let me know if you need any more information from the code, and thanks in advance!
EDIT: Examples of the dataframes:
bar_head
0 [-203.3502, 1554.3486, 1102.821]
1 [-203.428, 1554.3492, 1103.0592]
2 [-203.4954, 1554.3234, 1103.2794]
3 [-203.5022, 1554.2974, 1103.4522]
4 [-203.5014, 1554.2948, 1103.6594]
ARIEL LBHD LFHD RBHD RFHD
0 [-205.981, 1638.787, 1145.274] [-264.941, 1482.371, 1168.693] [-263.454, 1579.4370000000001, 1016.279] [-134.313, 1479.1429999999998, 1167.109] [-148.062, 1592.005, 1016.75]
1 [-206.203, 1638.649, 1145.734] [-264.85400000000004, 1482.069, 1168.776] [-263.587, 1579.6129999999998, 1016.627] [-134.286, 1479.0839999999998, 1167.076] [-148.21, 1592.3310000000001, 1017.0830000000001]
2 [-206.37599999999998, 1638.531, 1146.135] [-264.803, 1481.8210000000001, 1168.8519999999... [-263.695, 1579.711, 1016.922] [-134.265, 1478.981, 1167.104] [-148.338, 1592.5729999999999, 1017.3839999999...
3 [-206.493, 1638.405, 1146.519] [-264.703, 1481.5439999999999, 1168.95] [-263.742, 1579.8139999999999, 1017.207] [-134.15200000000002, 1478.922, 1167.112] [-148.421, 1592.8020000000001, 1017.4730000000...
4 [-206.56900000000002, 1638.33, 1146.828] [-264.606, 1481.271, 1169.0330000000001] [-263.788, 1579.934, 1017.467] [-134.036, 1478.888, 1167.289] [-148.50799999999998, 1593.0510000000002, 1017...
If the items in the columns of full and reduced are lists, convert them to numpy ndarrays first.
ariel = np.array(full.ARIEL.to_list())
lbhd = np.array(full.LBHD.to_list())
lfhd = np.array(full.LFHD.to_list())
rfhd = np.array(full.RFHD.to_list())
rbhd = np.array(full.RBHD.to_list())
barhead = np.array(reduced.bar_head.to_list())
Subtract barhead from ariel using broadcasting, square the results and sum along the last axis (assuming I understood the comment about your function).
a = np.sum(np.square(ariel-barhead[:,None,:]),-1)
Using the setup below the result is a (4,5) array of values (rounded to two places).
>>> a # a[0] a[1] a[2] a[3] a[4]
array([[8939.02, 8956.22, 8971.93, 8984.87, 8999.85], # b[0]
[8918.35, 8935.3 , 8950.79, 8963.53, 8978.35], # b[1]
[8903.82, 8920.53, 8935.82, 8948.36, 8963.04], # b[2]
[8893.7 , 8910.24, 8925.38, 8937.78, 8952.34]]) # b[3]
It seemed that you wanted a 1-d sequence for the result: a.ravel() produces a 1-d array like:
[(a[0]:b[0]),(a[1]:b[0]),(a[2]:b[0]),...,(a[0]:b[1]),(a[1]:b[1]),...,(a[0]:b[2]),...]
Do the same for the other four columns of full:
lb = np.sum(np.square(lbhd-barhead[:,None,:]),-1)
lf = np.sum(np.square(lfhd-barhead[:,None,:]),-1)
rf = np.sum(np.square(rfhd-barhead[:,None,:]),-1)
rb = np.sum(np.square(rbhd-barhead[:,None,:]),-1)
Again, assuming I understood your process, the result would be 100 values (using the setup below): (full rows * full columns) * (reduced rows) = (5 * 5) * 4 = 100.
x = np.concatenate([a.ravel(),lb.ravel(),lf.ravel(),rf.ravel(),rb.ravel()])
Setup
import numpy as np
import pandas as pd
lis = [[[-205.981, 1638.787, 1145.274],[-264.941, 1482.371, 1168.693],[-263.454, 1579.437, 1016.279],[-134.313, 1479.1429, 1167.109],[-148.062, 1592.005, 1016.75]],
[[-206.203, 1638.649, 1145.734],[-264.854, 1482.069, 1168.776],[-263.587, 1579.6129, 1016.627],[-134.286, 1479.0839, 1167.076],[-148.21, 1592.331, 1017.083]],
[[-206.3759, 1638.531, 1146.135],[-264.803, 1481.821, 1168.85199],[-263.695, 1579.711, 1016.922],[-134.265, 1478.981, 1167.104],[-148.338, 1592.5729, 1017.3839]],
[[-206.493, 1638.405, 1146.519],[-264.703, 1481.5439, 1168.95],[-263.742, 1579.8139, 1017.207],[-134.152, 1478.922, 1167.112],[-148.421, 1592.802, 1017.473]],
[[-206.569, 1638.33, 1146.828],[-264.606, 1481.271, 1169.033],[-263.788, 1579.934, 1017.467],[-134.036, 1478.888, 1167.289],[-148.5079, 1593.051, 1017.666]]]
barhd = [[[-203.3502, 1554.3486, 1102.821]],
[[-203.428, 1554.3492, 1103.0592]],
[[-203.4954, 1554.3234, 1103.2794]],
[[-203.5022, 1554.2974, 1103.4522]]]
full = pd.DataFrame(lis, columns=['ARIEL', 'LBHD', 'LFHD', 'RFHD', 'RBHD'])
reduced = pd.DataFrame(barhd,columns=['bar_head'])
I hope I understood you correctly; is this what you want?
v is lis and v2 is lis2.
An arithmetic function applied between a 3D array and a 2D array:
import numpy as np

na = np.array
v = na([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 1]]])
v2 = na([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

lst = []
for a in v:
    for b in a:
        for a2 in v2:
            lst.append(b + a2)  # you can do any arithmetic function here
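The same result can be obtained without explicit loops via numpy broadcasting; this vectorized variant is a sketch of my own (assuming v and v2 as defined above), not part of the answer:

# Add every row of v2 to every row of the flattened v via broadcasting;
# reshaping back gives the (3,)-vectors in the same order as the nested loops
pairs = v.reshape(-1, 1, 3) + v2.reshape(1, -1, 3)  # shape (4, 4, 3)
lst_vectorized = list(pairs.reshape(-1, 3))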

Drop a number of rows in a dataframe

I have a dataframe containing 25000 rows with two columns (text, class).
class contains one of [A, B, C].
data = pd.read_csv('E:\mydata.txt', sep="*")
data.columns = ["text", "class"]
I need to delete, for example, 10 rows of class A and 15 rows of class B.
You can achieve this with conditional slicing and the index property of dataframes:
remove_n = 10
remove_class = 'A'

# First find the indexes where class is equal to the class you want to drop,
# then slice only the first n indexes of this class
index_to_drop = data.index[data['class'] == remove_class][:remove_n]

# Finally drop those indexes
data = data.drop(index_to_drop)
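To cover both classes from the question in one go, the same idea can be wrapped in a small loop; this extension is a sketch of mine using the counts from the question:

# Drop the first 10 rows of class A and the first 15 rows of class B
to_remove = {'A': 10, 'B': 15}
for cls, n in to_remove.items():
    index_to_drop = data.index[data['class'] == cls][:n]
    data = data.drop(index_to_drop)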
You can construct a single Boolean mask via groupby.cumcount and np.logical_or.reduce, then apply it to your dataframe via iloc:
# example dataframe
df = pd.DataFrame({'group': np.random.randint(0, 3, 100),
                   'value': np.random.random(100)})
print(df.shape)  # (100, 2)

# criteria input: {class: number of rows to drop}
criteria = {1: 10, 2: 15}

# cumulative count by group
cum_count = df.groupby('group').cumcount()

# Boolean mask, negated via ~
conditions = [(df['group'].eq(k) & cum_count.lt(v)) for k, v in criteria.items()]
mask = ~np.logical_or.reduce(conditions)

# apply Boolean mask
res = df.iloc[mask]
print(res.shape)  # (75, 2)

Spark union of multiple RDDs

In my Pig code I do this:
all_combined = UNION relation1, relation2,
    relation3, relation4, relation5, relation6;
I want to do the same with Spark. However, unfortunately, I see that I have to keep doing it pairwise:
first = rdd1.union(rdd2)
second = first.union(rdd3)
third = second.union(rdd4)
# .... and so on
Is there a union operator that will let me operate on multiple RDDs at a time?
e.g. union(rdd1, rdd2, rdd3, rdd4, rdd5, rdd6)
It is a matter of convenience.
If these are RDDs you can use the SparkContext.union method:
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([4, 5, 6])
rdd3 = sc.parallelize([7, 8, 9])
rdd = sc.union([rdd1, rdd2, rdd3])
rdd.collect()
## [1, 2, 3, 4, 5, 6, 7, 8, 9]
There is no DataFrame equivalent but it is just a matter of a simple one-liner:
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

df1 = sqlContext.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))
df2 = sqlContext.createDataFrame([(3, "foo2"), (4, "bar2")], ("k", "v"))
df3 = sqlContext.createDataFrame([(5, "foo3"), (6, "bar3")], ("k", "v"))

unionAll(df1, df2, df3).show()
## +---+----+
## | k| v|
## +---+----+
## | 1|foo1|
## | 2|bar1|
## | 3|foo2|
## | 4|bar2|
## | 5|foo3|
## | 6|bar3|
## +---+----+
If the number of DataFrames is large, using SparkContext.union on the underlying RDDs and recreating the DataFrame may be a better choice, to avoid issues related to the cost of preparing an execution plan:
def unionAll(*dfs):
    first, *_ = dfs  # Python 3.x, for 2.x you'll have to unpack manually
    return first.sql_ctx.createDataFrame(
        first.sql_ctx._sc.union([df.rdd for df in dfs]),
        first.schema
    )
You can also use addition for a UNION between RDDs:
rdd = sc.parallelize([1, 1, 2, 3])
(rdd + rdd).collect()
## [1, 1, 2, 3, 1, 1, 2, 3]
Unfortunately it's the only way to UNION tables in Spark. However, instead of
first = rdd1.union(rdd2)
second = first.union(rdd3)
third = second.union(rdd4)
...
you can perform it in a slightly cleaner way, like this:
result = rdd1.union(rdd2).union(rdd3).union(rdd4)
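If the RDDs already sit in a Python list, the pairwise chaining can also be folded with functools.reduce; a small sketch of the same idea (assuming a list named rdds):

from functools import reduce

rdds = [rdd1, rdd2, rdd3, rdd4]

# Equivalent to rdd1.union(rdd2).union(rdd3).union(rdd4)
result = reduce(lambda a, b: a.union(b), rdds)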
