I have the following dataset:
test_table = spark.createDataFrame(
    [
        ("US", "CA", "S", "2022-10-01", 100, 10, 1),
        ("US", "CA", "M", "2022-10-01", 100, 15, 5),
        ("US", "CA", "L", "2022-10-01", 100, 20, 10),
        ("US", "CA", "S", "2022-10-01", 200, 10, 1),
        ("US", "CA", "M", "2022-10-01", 200, 15, 5),
        ("US", "CA", "L", "2022-10-01", 200, 20, 10),
        ("US", "CA", "S", "2022-10-02", 100, 11, 1),
        ("US", "CA", "M", "2022-10-02", 100, 13, 3),
        ("US", "CA", "L", "2022-10-02", 100, 17, 7),
        ("US", "CA", "S", "2022-10-02", 200, 11, 1),
        ("US", "CA", "M", "2022-10-02", 200, 13, 3),
    ],
    schema=["country_code", "state_code", "size", "dt", "store_id", "ttl_sold", "ttl_returned"],
)
I then do some aggregations and end up with two columns (latest_payload, prev_payload), each an array of maps, built as follows:
from pyspark.sql import Window, functions as F

w = Window.partitionBy("country_code", "state_code", "size", "store_id").orderBy("dt").rangeBetween(Window.unboundedPreceding, 0)
w2 = Window.partitionBy("country_code", "state_code", "size").orderBy("dt")

df_w_cumulative_sum = (
    test_table
    .withColumn("cumulative_ttl_sold", F.sum("ttl_sold").over(w))
    .withColumn("cumulative_ttl_returned", F.sum("ttl_returned").over(w))
    .groupBy("dt", "country_code", "state_code", "size")
    .agg(
        F.collect_list(
            F.create_map(F.col("store_id"), F.struct(F.col("cumulative_ttl_sold"), F.col("cumulative_ttl_returned")))
        ).alias("latest_payload")
    )
    .withColumn("prev_payload", F.lag(F.col("latest_payload"), 1).over(w2))
    .where(F.col("dt") == "2022-10-02")
)
| row | dt | country_code | state_code | size | latest_payload | prev_payload |
| 1 | 2022-10-01 | US | CA | L | [{"100":{"cumulative_ttl_sold":20,"cumulative_ttl_returned":10}},{"200":{"cumulative_ttl_sold":20,"cumulative_ttl_returned":10}}] | null |
| 2 | 2022-10-01 | US | CA | M | [{"100":{"cumulative_ttl_sold":15,"cumulative_ttl_returned":5}},{"200":{"cumulative_ttl_sold":15,"cumulative_ttl_returned":5}}] | null |
| 3 | 2022-10-01 | US | CA | S | [{"100":{"cumulative_ttl_sold":10,"cumulative_ttl_returned":1}},{"200":{"cumulative_ttl_sold":10,"cumulative_ttl_returned":1}}] | null |
| 4 | 2022-10-02 | US | CA | L | [{"100":{"cumulative_ttl_sold":37,"cumulative_ttl_returned":17}}] | [{"100":{"cumulative_ttl_sold":20,"cumulative_ttl_returned":10}},{"200":{"cumulative_ttl_sold":20,"cumulative_ttl_returned":10}}] |
| 5 | 2022-10-02 | US | CA | M | [{"100":{"cumulative_ttl_sold":28,"cumulative_ttl_returned":8}},{"200":{"cumulative_ttl_sold":28,"cumulative_ttl_returned":8}}] | [{"100":{"cumulative_ttl_sold":15,"cumulative_ttl_returned":5}},{"200":{"cumulative_ttl_sold":15,"cumulative_ttl_returned":5}}] |
| 6 | 2022-10-02 | US | CA | S | [{"100":{"cumulative_ttl_sold":21,"cumulative_ttl_returned":2}},{"200":{"cumulative_ttl_sold":21,"cumulative_ttl_returned":2}}] | [{"100":{"cumulative_ttl_sold":10,"cumulative_ttl_returned":1}},{"200":{"cumulative_ttl_sold":10,"cumulative_ttl_returned":1}}] |
Expected output for row 4:
{'100': {'cumulative_ttl_sold': 37, 'cumulative_ttl_returned': 17}, '200': {'cumulative_ttl_sold': 20, 'cumulative_ttl_returned': 10}}
Attempted solution (it gives me the wrong values for each row):
from typing import Dict

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, LongType, MapType, StructField, StructType

@F.udf(
    MapType(
        IntegerType(),
        StructType([
            StructField("cumulative_ttl_sold", LongType(), False),
            StructField("cumulative_ttl_returned", LongType(), False)
        ])
    )
)
def merge_payloads(latest_payload, prev_payload):
    payload: Dict[int, Dict[str, int]] = {}
    if prev_payload is not None:
        for latest in latest_payload:
            for k, v in latest.items():
                payload[k] = v
        for prev in prev_payload:
            for k, v in prev.items():
                if k not in payload.keys():
                    payload[k] = v
                else:
                    break
    else:
        for latest in latest_payload:
            for k, v in latest.items():
                payload[k] = v
    return payload
Give this all the correct decorators and such, and it'll do what you're looking for...
def merge_payloads(latest_payload, prev_payload):
    # Later (latest) entries overwrite earlier (prev) ones with the same key;
    # "or []" guards against the null prev_payload on the first dt
    return dict(y for x in [*(prev_payload or []), *latest_payload] for y in x.items())
latest = [{'100': {'cumulative_ttl_sold': 37, 'cumulative_ttl_returned': 17}}]
prev = [{'100': {'cumulative_ttl_sold': 20, 'cumulative_ttl_returned': 10}},
{'200': {'cumulative_ttl_sold': 20, 'cumulative_ttl_returned': 10}}]
print(merge_payloads(latest, prev))
# Output:
{'100': {'cumulative_ttl_sold': 37, 'cumulative_ttl_returned': 17},
'200': {'cumulative_ttl_sold': 20, 'cumulative_ttl_returned': 10}}
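For reference, here is a sketch of wiring that one-liner up as a UDF and applying it (the return schema mirrors the attempted solution above; df_w_cumulative_sum and the column names come from the question):

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, LongType, MapType, StructField, StructType

merge_payloads_udf = F.udf(
    merge_payloads,
    MapType(
        IntegerType(),
        StructType([
            StructField("cumulative_ttl_sold", LongType(), False),
            StructField("cumulative_ttl_returned", LongType(), False),
        ]),
    ),
)

result = df_w_cumulative_sum.withColumn(
    "merged_payload", merge_payloads_udf("latest_payload", "prev_payload")
)
result.select("dt", "size", "merged_payload").show(truncate=False)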
I have 12 dataframes (july_df, aug_df.........june_df).
They all share the same column names, and I'd like to merge them, joining on the first column, "MINISTRY", and rename all the columns with the name of the df.
For example, the output could look like:
Ministry  item_jul  item_aug ... item_jun  item2_jul  item2_aug ... item2_jun
xyz       1         10           12        11         22            11
abc
You could try this:
import pandas as pd
def get_df_name(df):
    # Recover the global variable name that is bound to this dataframe object
    name = [x for x in globals() if globals()[x] is df][0]
    return name
# Toy dataframes
june = pd.DataFrame(
{
"ministry": ["abc", "def", "ghi", "jkl"],
"item": [1, 2, 3, 4],
"item2": [5, 6, 7, 8],
"item3": [9, 10, 11, 12],
}
)
july = pd.DataFrame(
{
"ministry": ["abc", "def", "ghi", "jkl"],
"item": [13, 14, 15, 16],
"item2": [17, 18, 19, 20],
"item3": [21, 22, 23, 24],
}
)
august = pd.DataFrame(
{
"ministry": ["abc", "def", "ghi", "jkl"],
"item": [25, 26, 27, 28],
"item2": [29, 30, 31, 32],
"item3": [33, 34, 35, 36],
}
)
dfs = [june, july, august]
# Merge dataframes on "ministry" after renaming columns
merged_dfs = dfs[0]
merged_dfs.columns = [
f"{col}_{get_df_name(merged_dfs)}" if col != "ministry" else col
for col in merged_dfs.columns
]
for df in dfs[1:]:
df.columns = [
f"{col}_{get_df_name(df)}" if col != "ministry" else col for col in df.columns
]
merged_dfs = merged_dfs.merge(df, on="ministry")
print(merged_dfs)
# Outputs
ministry item_june item2_june ... item_august item2_august item3_august
0 abc 1 5 ... 25 29 33
1 def 2 6 ... 26 30 34
2 ghi 3 7 ... 27 31 35
3 jkl 4 8 ... 28 32 36
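With the 12 real dataframes, the same pattern collapses to a reduce. A sketch, assuming freshly loaded (not yet renamed) dataframes and that you list each one with its suffix explicitly instead of recovering names via globals():

from functools import reduce

named_dfs = [("june", june), ("july", july), ("august", august)]  # extend to all 12

# Rename everything except the join key, then merge pairwise
renamed = [
    df.rename(columns={c: f"{c}_{name}" for c in df.columns if c != "ministry"})
    for name, df in named_dfs
]
merged = reduce(lambda left, right: left.merge(right, on="ministry"), renamed)
print(merged)

Listing the names explicitly is also more robust than get_df_name, which breaks if a dataframe is bound to more than one variable.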
I have a function in Python that I would like to adapt to PySpark. I am pretty new to PySpark, so finding a way to implement this, whether with a UDF or natively in PySpark, is posing a challenge.
Essentially, it performs a series of numpy calculations on a grouped dataframe. I am not entirely sure of the best way to do this in PySpark.
Python code:
import numpy as np
import pandas as pd

data = [
[1, "a", 10, 23, 33],
[1, "b", 11, 25, 34],
[1, "c", 12, 35, 35],
[1, "d", 13, 40, 36],
[2, "e", 14, 56, 38],
[2, "g", 14, 56, 39],
[2, "g", 16, 40, 38],
[2, "g", 19, 87, 90],
[3, "a", 20, 12, 90],
[3, "a", 21, 45, 80],
[3, "b", 21, 45, 38],
[3, "c", 12, 45, 67],
[3, "d", 18, 45, 78],
[3, "d", 12, 78, 90],
[3, "d", 8, 85, 87],
[3, "d", 19, 87, 89],
]
df = pd.DataFrame(data, columns=["id", "sub_id", "sub_sub_id", "value_1", "value_2"])
df
grouped_df = df.groupby(["id", "sub_id", "sub_sub_id"])
aggregated_df = grouped_df.agg(
{
"value_1": ["mean", "std"],
"value_2": ["mean", "std"],
}
).reset_index()
for value in ["value_1", "value_2"]:
aggregated_df[f"{value}_calc"] = np.maximum(
aggregated_df[value]["mean"]
- grouped_df[value].min().values,
grouped_df[value].max().values
- aggregated_df[value]["mean"],
)
I was trying to use a window function on the already grouped and aggregated Spark DataFrame, but I am pretty sure this is not the best way to do this.
test = aggregated_sdf.withColumn(
"new_calculated_value",
spark_fns.max(
spark_fns.expr(
"ave_value_1" - spark_fns.min(spark_fns.collect_list("ave_value_1"))
),
(
spark_fns.expr(
spark_fns.max(spark_fns.collect_list("ave_value_1")) - "ave_value_1"
)
),
).over(Window.partitionBy("id", "sub_id", "sub_sub_id"))
You can try doing the calculations during the aggregation, similar to what you did in the pandas code. The equivalent of np.maximum should be F.greatest. F.max is an aggregate function which gets the maximum in a column, while F.greatest is not an aggregate function, and gets the maximum of several columns along a single row.
import pyspark.sql.functions as F
df2 = df.groupby("id", "sub_id", "sub_sub_id").agg(
F.mean('value_1').alias('ave_value_1'),
F.mean('value_2').alias('ave_value_2'),
F.greatest(
F.mean('value_1') - F.min('value_1'),
F.max('value_1') - F.mean('value_1')
).alias('value_1_calc'),
F.greatest(
F.mean('value_2') - F.min('value_2'),
F.max('value_2') - F.mean('value_2')
).alias('value_2_calc')
)
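To sanity-check this end to end, a sketch assuming an active SparkSession bound to spark and reusing the data list from the pandas example above:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Same rows as the pandas example; Spark infers long/string types
df = spark.createDataFrame(
    data, schema=["id", "sub_id", "sub_sub_id", "value_1", "value_2"]
)

Running the aggregation above on this df and calling df2.show() reproduces the pandas numbers. Note that every (id, sub_id, sub_sub_id) group in this toy data has exactly one row, so mean, min, and max coincide and both calc columns come out 0.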
I'm new to Gurobi and Python in general, and keep getting the error 'KeyError: 0' on line 27 (which is the final line) whenever I run my code (which obviously isn't complete, but my professor encouraged us to run our code as we write it because it's easier to troubleshoot that way).
I've read on multiple forums what that error means (that the code tried to access key 0 in a dictionary where that key isn't present or hasn't been initialized), but I still don't really understand it.
from gurobipy import *
# Sets
SetA = ["a", "b", "c", "d", "e"]
SetB = ["f", "g", "h", "i", "j",
"k", "l", "m", "n", "o"]
A = range(len(SetA))
B = range(len(SetB))
# Data
PC = 100
X = [1, 2, 3, 4, 5]
D = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Y = [
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
[31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
]
m = Model("Problem 2")
# Variables
Z = {(a,b): m.addVar() for a in A for b in B}
# Objective
m.setObjective(quicksum((PC+X[a]+Y[a][b])*Z[a][b] for a in A for b in B), GRB.MINIMIZE)
Solution:
Change final line to:
m.setObjective(quicksum((PC+X[a]+Y[a][b])*Z[a,b] for a in A for b in B), GRB.MINIMIZE)
You are getting KeyError: 0 because at the beginning of your generator expression, where
for a in A
a is equal to 0, so on this line
m.setObjective(quicksum((PC+X[a]+Y[a][b])*Z[a][b] for a in A for b in B), GRB.MINIMIZE)
where you typed Z[a][b], you are trying to access the value at key 0 of dictionary Z (and then key 0 of dictionary Z[a], which does not even exist), but dictionary Z has no key 0, since all of its keys are tuples.
So, as you correctly derived yourself, you don't want to access a value stored at key b of dictionary Z[a]; instead you want the value stored at key (a, b) of dictionary Z:
m.setObjective(quicksum((PC+X[a]+Y[a][b])*Z[a,b] for a in A for b in B), GRB.MINIMIZE)
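You can see the difference with a minimal, Gurobi-free toy dict built the same way as Z:

# Tuple-keyed dict, analogous to Z = {(a,b): m.addVar() for a in A for b in B}
Z = {(a, b): 10 * a + b for a in range(2) for b in range(3)}

print(Z[0, 1])    # 1  -- Z[0, 1] is shorthand for Z[(0, 1)]
print(Z[(1, 2)])  # 12 -- the same tuple-key lookup, written explicitly
# print(Z[0][1])  # KeyError: 0 -- Z has no key 0; its keys are tuples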
I have a .csv list of values (C3,H5,HK,HA,SK; column names: card1, card2, card3, card4, card5) and want to change, for example, C3 into an integer value with a dictionary.
Let's say the dictionary is d = {'C2': 0, 'C3': 1, 'C4': 2, 'C5': 3, 'C6': 4, 'C7': 5, 'C8': 6, 'C9': 7, 'CT': 8, 'CJ': 9, 'CQ': 10, 'CK': 11, 'CA': 12, 'D2': 13, 'D3': 14, 'D4': 15, 'D5': 16, 'D6': 17, 'D7': 18, 'D8': 19, 'D9': 20, 'DT': 21, 'DJ': 22, 'DQ': 23, 'DK': 24, 'DA': 25, 'H2': 26, 'H3': 27, 'H4': 28, 'H5': 29, 'H6': 30, 'H7': 31, 'H8': 32, 'H9': 33, 'HT': 34, 'HJ': 35, 'HQ': 36, 'HK': 37, 'HA': 38, 'S2': 39, 'S3': 40, 'S4': 41, 'S5': 42, 'S6': 43, 'S7': 44, 'S8': 45, 'S9': 46, 'ST': 47, 'SJ': 48, 'SQ': 49, 'SK': 50, 'SA': 51}
This is the code I have to change the 'card1' column:
import pandas as pd

testdata = pd.read_csv('file.csv')

def convert_card_to_int(c):
    if c == '' or c == ' ':
        print('card slot is empty or a blank')
        return 0
    if c in d:
        return d.get(c)
    else:
        print('card is not part of cardDict')
        return 0

for index, rec in testdata.iterrows():
    testdata['card1'][index] = convert_card_to_int(testdata['card1'][index])
testdata['card1'] = testdata['card1'].astype(int)
I am new to Python and have not worked with dictionaries before, so I was searching some forums but did not find what I needed; maybe I was even typing the wrong questions.
The problem is that I want to check whether a value from the list is a key of the dictionary, and if it is, replace it with the corresponding integer value.
The second if statement is the part where the problem occurs, or it is in the for loop beneath it. The error message tells me [TypeError: unhashable type: 'dict'].
testinput (file.csv):
card1,card2,card3,card4,card5
C3,H5,HK,HA,SK
C9,HJ,ST,SQ,SA
S6,S7,S8,S9,ST
testoutput:
testdata.head()
idx card1 card2 card3 card4 card5
0 1 H5 HK HA SK
1 7 HJ ST SQ SA
2 43 S7 S8 S9 ST
I think you are looking for this:
df = pd.read_csv(filepath)
where d is your dictionary:
d = {'C2': 0, 'C3': 1, 'C4': 2, 'C5': 3, 'C6': 4, 'C7': 5, 'C8': 6, ...}
If you want to do it for a specific column, e.g. card1, you can do it like this:
df['card1'] = df['card1'].apply(lambda x: d[x])
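And if you want to convert all five card columns at once, here is a sketch assuming the dictionary d from the question and the file.csv shown above; unmapped or blank cards fall back to 0, mirroring convert_card_to_int:

import pandas as pd

df = pd.read_csv("file.csv")

# Map each card column through d; missing keys become NaN, then 0
for col in ["card1", "card2", "card3", "card4", "card5"]:
    df[col] = df[col].map(d).fillna(0).astype(int)

print(df.head())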
I would like to create a dictionary in Python using numpy commands.
First I tried to define the structure and then to populate the array according to a number/case selected by the user. When I try to request one of the cases I get the following error (for case 1):
cannot copy sequence with size 3 to array axis with dimension 1
How can I fix my code so that I can store the data I want in my structure, regardless of the case I select?
Here is my code:
import numpy as np

# defining the structure
usgStruct = np.zeros(1, dtype=[("satNr", np.int),
("satAzimuth", np.int),
("satElevation", np.int),
("scenarioEnv", np.str),
("scenarioHead", np.int),
("scenarioLen", np.int),
("speed", np.int)])
def case1():
usgStruct["satNr"] = 3
usgStruct["satAzimuth"] = [180, 200, 235]
usgStruct["satElevation"] = [35, 25, 25]
usgStruct["scenarioEnv"] = ["S", "S", "S", "U", "U"]
usgStruct["scenarioHead"] = [45, 280, 45, 120, 200]
usgStruct["scenarioLen"] = [2000, 500, 3000, 2000, 500]
usgStruct["speed"] = [15, 15, 15, 10, 10]
return usgStruct
def case2():
usgStruct["satNr"] = 2
usgStruct["satAzimuth"] = [180, 225]
usgStruct["satElevation"] = [45, 30]
usgStruct["scenarioEnv"] = ["U", "U", "O", "O", "S", "S", "S"]
usgStruct["scenarioHead"] = [30, 65, 65, 80, 80, 60, 130]
usgStruct["scenarioLen"] = [300, 800, 2000, 1000, 700, 700, 300]
usgStruct["speed"] = [10, 10, 15, 15, 15, 15, 15]
return usgStruct
def case3():
usgStruct["satNr"] = 2
usgStruct["satAzimuth"] = [180, 225]
usgStruct["satElevation"] = [35, 30]
usgStruct["scenarioEnv"] = ['C', 'C', 'C', 'C', 'O']
usgStruct["scenarioHead"] = [90, 45, 120, 70, 45]
usgStruct["scenarioLen"] = [1500, 500, 300, 2000, 3000]
usgStruct["speed"] = [15, 15, 15, 15, 20]
return usgStruct
# set up a dictionary of actions
scenarioGenerator = {
"1": case1,
"2": case2,
"3": case3}
runscenGen = raw_input("Please enter a number from 1 to 7\n ")
scenarioGenerator.get(runscenGen,case3)() # specify a default: case3
print usgStruct
Print the initial usgStruct array:
In [329]: usgStruct
Out[329]:
array([(0, 0, 0, '', 0, 0, 0)],
dtype=[('satNr', '<i4'), ('satAzimuth', '<i4'), ('satElevation', '<i4'), ('scenarioEnv', '<U'), ('scenarioHead', '<i4'), ('scenarioLen', '<i4'), ('speed', '<i4')])
Its data is 6 numbers and one character ('U' on my py3). That's all it can hold. It can't hold lists.
Even if you defined it to have size (3,):
In [331]: usgStruct
Out[331]:
array([(0, 0, 0, '', 0, 0, 0), (0, 0, 0, '', 0, 0, 0),
(0, 0, 0, '', 0, 0, 0)],
dtype=[('satNr', '<i4'), ('satAzimuth', '<i4'), ('satElevation', '<i4'), ('scenarioEnv', '<U'), ('scenarioHead', '<i4'), ('scenarioLen', '<i4'), ('speed', '<i4')])
Individual records are still this 7-element tuple.
Your case data is entirely different. Each case looks like a dictionary with list values. Changing case1 to produce and return a dictionary:
In [334]: def case1():
...: usgStruct={}
...: usgStruct["satNr"] = 3
...: usgStruct["satAzimuth"] = [180, 200, 235]
...: usgStruct["satElevation"] = [35, 25, 25]
...: usgStruct["scenarioEnv"] = ["S", "S", "S", "U", "U"]
...: usgStruct["scenarioHead"] = [45, 280, 45, 120, 200]
...: usgStruct["scenarioLen"] = [2000, 500, 3000, 2000, 500]
...: usgStruct["speed"] = [15, 15, 15, 10, 10]
...: return usgStruct
...:
In [335]: case1()
Out[335]:
{'satAzimuth': [180, 200, 235],
'satElevation': [35, 25, 25],
'satNr': 3,
'scenarioEnv': ['S', 'S', 'S', 'U', 'U'],
'scenarioHead': [45, 280, 45, 120, 200],
'scenarioLen': [2000, 500, 3000, 2000, 500],
'speed': [15, 15, 15, 10, 10]}
Now scenarioGenerator would be a dictionary of dictionaries.
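Putting it together, a sketch of the dispatch table with the dictionary-returning case functions (case2 and case3 rewritten the same way as case1; input() replaces Python 2's raw_input):

# Dispatch table mapping the user's choice to a case function
scenarioGenerator = {
    "1": case1,
    "2": case2,
    "3": case3,
}

choice = input("Please enter a number from 1 to 7\n ")
usgStruct = scenarioGenerator.get(choice, case3)()  # case3 is the default
print(usgStruct)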