I am trying to implement CDC (change data capture) in Apache Beam, deployed on Google Cloud Dataflow.
I have loaded the master data and the new data, which is expected to arrive daily.
The join is not working as expected; something is missing.
master_data = (
    p
    | 'Read base from BigQuery' >> beam.io.Read(beam.io.BigQuerySource(query=master_data, use_standard_sql=True))
    | 'Map id in master' >> beam.Map(lambda master: (master['id'], master))
)

new_data = (
    p
    | 'Read Delta from BigQuery' >> beam.io.Read(beam.io.BigQuerySource(query=new_data, use_standard_sql=True))
    | 'Map id in new' >> beam.Map(lambda new: (new['id'], new))
)

joined_dicts = (
    {'master_data': master_data, 'new_data': new_data}
    | beam.CoGroupByKey()
    | beam.FlatMap(join_lists)
    | 'mergeddicts' >> beam.Map(lambda masterdict, newdict: newdict.update(masterdict))
)

def join_lists(k, v):
    itertools.product(v['master_data'], v['new_data'])
Observations (on sample data):
Data from the master table:
1, 'A',3232
2, 'B',234
New Data:
1,'A' ,44
4,'D',45
Expected result in the master table after running the code:
1, 'A',44
2, 'B',234
4,'D',45
However, what I am getting in the master table is:
1,'A' ,44
4,'D',45
Am I missing a step? Can anyone please help me rectify my mistake?
You don't need to apply FlatMap after the CoGroupByKey, as it separates the grouped elements again.
Here is sample code:
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam as beam

def join_lists(e):
    (k, v) = e
    return (k, v['new_data']) if v['new_data'] != v['master_data'] else (k, None)

with beam.Pipeline(options=PipelineOptions()) as p:
    master_data = (
        p | 'Read base from BigQuery' >> beam.Create([('A', [3232]), ('B', [234])])
    )
    new_data = (
        p | 'Read Delta from BigQuery' >> beam.Create([('A', [44]), ('D', [45])])
    )
    joined_dicts = (
        {'master_data': master_data, 'new_data': new_data}
        | beam.CoGroupByKey()
        | 'mergeddicts' >> beam.Map(join_lists)
    )

# The with block runs the pipeline and waits for it to finish on exit,
# so no explicit p.run()/result.wait_until_finish() is needed.
print("Pipeline finished.")
class load_side_input(beam.DoFn):
    def process(self, pubsub_message):
        message = pubsub_message.decode("utf8")
        output: typing.Dict = {}
        for key in message.keys():
            output[key] = self.tag_model[key]
        return [output]

side_input = (
    p
    | "AMM Events" >> beam.io.ReadFromPubSub(subscription=opts.ammSub)
    | "Trigger event" >> beam.WindowInto(window.GlobalWindows(),
                                         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
                                         accumulation_mode=trigger.AccumulationMode.DISCARDING)
    | "Parse and Update Cache" >> beam.ParDo(load_side_input())
)

enrichment = (
    rows
    | 'Data Validation and Enrichment' >> beam.ParDo(validation(), y_side=AsDict(side_input))
)
File "/usr/local/lib/python3.9/site-packages/apache_beam/runners/worker/bundle_processor.py", line 434, in __getitem__
self._cache[target_window] = self._side_input_data.view_fn(raw_view)
ValueError: dictionary update sequence element #0 has length 101; 2 is required [while running 'Data Enrichment-ptransform-128']
You are feeding beam.pvalue.AsDict an input in the wrong format. According to the documentation:
Parameters: pcoll – Input pcollection. All elements should be key-value pairs (i.e. 2-tuples) with unique keys.
Here is a minimal working example, which can be run in the Apache Beam Playground:
import apache_beam as beam

def use_side_input(main, side_input):
    return side_input[main]

class BadSideInputCreator(beam.DoFn):
    def process(self, element):
        output = {}
        output['1'] = 'value1'
        output['2'] = 'value2'
        yield [output]  # this is a list containing a dict, not a 2-tuple

class GoodSideInputCreator(beam.DoFn):
    def process(self, element):
        output = {}
        output['1'] = 'value1'
        output['2'] = 'value2'
        for key, value in output.items():
            yield (key, value)  # this is a 2-tuple

with beam.Pipeline() as pipeline:
    main = (
        pipeline
        | "init main" >> beam.Create(['1', '2'])
    )
    side = (
        pipeline
        | "init side" >> beam.Create(['dummy'])
        | beam.ParDo(BadSideInputCreator())  # replace with GoodSideInputCreator
    )
    (
        main
        | "use side input" >> beam.Map(use_side_input, side_input=beam.pvalue.AsDict(side))
        | "print" >> beam.Map(print)
    )
Running with BadSideInputCreator throws your error
ValueError: dictionary update sequence element #0 has length 1; 2 is required
while with GoodSideInputCreator we get the expected result
value1
value2
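Applied to the DoFn from the question, a hedged sketch of the fix would yield (key, value) 2-tuples instead of a single dict. It assumes the Pub/Sub message body decodes to a JSON object and that tag_model (from the original code) maps those keys to values:

import json
import apache_beam as beam

class load_side_input(beam.DoFn):
    def process(self, pubsub_message):
        # Assumption: the message body is a JSON object, e.g. {"key1": ..., "key2": ...}.
        message = json.loads(pubsub_message.decode("utf8"))
        for key in message.keys():
            # Emit 2-tuples so beam.pvalue.AsDict can build a dict view from them.
            yield (key, self.tag_model[key])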
I'm getting the following error while running the pipeline with the DirectRunner:
can't pickle generator objects [while running 'Assign Side Columns']
The error pops up when Beam writes the "mapped rows" to BigQuery.
Any idea where this error is coming from?
def assign_columns(row, mapping):
    main_account = row['filepath_platform_id']
    main_report = row['filepath_report_name']
    for _mapping in mapping:
        _side_account = _mapping['account_id']
        _side_report = _mapping['report_name']
        if main_account == _side_account and main_report == _side_report:
            _cols = json.loads(_mapping['mapping'])
            row = renames_columns(row, _cols)
    yield row

def run(argv=None, save_main_session=True):
    options = PipelineOptions(flags=argv)
    options.view_as(SetupOptions).save_main_session = save_main_session
    table_spec = 'SELECT * FROM `nextwork-staging.datalake.v_mapping_reports`'
    with beam.Pipeline(options=options) as p:
        common_options = options.view_as(CommonOptions)
        file_metadata = (
            p
            | 'Create empty PCollection' >> beam.Create(['Start'])
            | 'Load Input Location' >> beam.ParDo(CreateFilesFromGlobByDate(common_options.input, None, None, None, common_options.starting_date, common_options.ending_date, common_options.only_last_file_from_input))
        )
        rows = (
            file_metadata
            | 'GetIncomeAccessFn' >> beam.ParDo(GetIncomeAccessFn())
            | 'Map to regular dictionary' >> beam.Map(lambda x: dict(x))
            | 'TransformDate' >> beam.ParDo(TransformDateFromStringFn())
        )
        side_input = (
            p
            | 'ReadTable' >> beam.io.ReadFromBigQuery(query=table_spec, use_standard_sql=True)
            | 'Map Columns' >> beam.Map(lambda x: dict(x))
        )
        mapped_rows = (
            rows
            | 'Assign Side Columns' >> beam.Map(assign_columns, mapping=beam.pvalue.AsIter(side_input))
            | 'Rename column' >> beam.Map(renames_columns, names={'site': 'campaign'})
            | 'TransformForBigQuery' >> beam.ParDo(TransformForBigQueryFn())
        )
        gcp_options = options.view_as(GoogleCloudOptions)
        (
            mapped_rows
            | 'WriteToBigQueryUsingParameter' >> bigquery_file_loads.BigQueryBatchFileLoads(
                destination=_get_table_name,
                schema=_get_schema,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_TRUNCATE',
                additional_bq_parameters=_get_additional_bq_parameters,
                custom_gcs_temp_location=gcp_options.temp_location
            )
        )
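A hedged note on the likely cause, since no answer is included above: assign_columns uses yield, so calling it returns a generator object, and beam.Map emits that generator itself as the output element; Beam then fails when it tries to pickle it for the downstream BigQuery write. The usual fix is to apply the function with beam.FlatMap, which iterates over the yielded rows. The step below is a sketch of that single change, reusing the names from the pipeline above:

# Sketch: FlatMap unpacks what the generator yields, so downstream steps
# receive the row dicts instead of an unpicklable generator object.
mapped_rows = (
    rows
    | 'Assign Side Columns' >> beam.FlatMap(assign_columns, mapping=beam.pvalue.AsIter(side_input))
    | 'Rename column' >> beam.Map(renames_columns, names={'site': 'campaign'})
    | 'TransformForBigQuery' >> beam.ParDo(TransformForBigQueryFn())
)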
I have an Apache Beam pipeline written in Python which reads data from a Pub/Sub topic, then aggregates and transposes the data and writes it to GCS.
When I run the Python script using the direct runner, the pipeline works and the GCS file is populated. When I try to deploy it on Cloud Dataflow, it throws an error.
Error log in the Dataflow UI:
I tried the same transformation writing back to a BigQuery table, and it worked fine with the Dataflow runner.
Apache Beam version: apache-beam==2.27.0
Command used to execute :
python3 pubSubToGCS.py --runner DataFlow --project <PROJECT_ID> \
--temp_location gs://<PROJECT_ID>-temp-bucket/tmp \
--staging_location gs://<PROJECT_ID>-temp-bucket/staging \
--streaming --job_name dataloading --region us-central1 \
--SUBSCRIPTION_NAME projects/<PROJECT_ID>/subscriptions/RawSensorReadings-sub \
--GCS_PATH gs://<PROJECT_ID>/RawDataReadings/TEST
pubSubToGCS.py:
import argparse
import datetime
import logging
import time

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam.transforms.window as window
import pandas as pd
from apache_beam.io import WriteToText


def convertToCsv(sensorValues):
    (timestamp, values) = sensorValues
    df = pd.DataFrame(values)
    df.columns = ["Sensor", "Value"]
    csvStr = str(timestamp) + "," + ",".join(str(x) for x in list(df.groupby(["Sensor"]).mean().T.iloc[0]))
    return csvStr


def roundTime(dt=None, roundTo=60):
    if dt is None:
        dt = datetime.datetime.now()
    seconds = (dt.replace(tzinfo=None) - dt.min).seconds
    rounding = (seconds + roundTo / 2) // roundTo * roundTo
    return str(dt + datetime.timedelta(0, rounding - seconds, -dt.microsecond))


def run(subscription_path, output_gcs_path, interval=1, pipeline_args=None):
    with beam.Pipeline(options=PipelineOptions(pipeline_args, streaming=True, save_main_session=True)) as p:
        data = (
            p
            | 'ReadData' >> beam.io.ReadFromPubSub(subscription=subscription_path)
            | "Decode" >> beam.Map(lambda x: x.decode('utf-8'))
            # Raw messages come in this format:
            # ....
            # 2021-02-15 11:58:25.637077,Pressure_2,2064.6582700689737
            # 2021-02-15 11:58:25.638462,Pressure_3,2344.5496650659934
            # 2021-02-15 11:58:25.639584,Pressure_4,2124.8628543586183
            # 2021-02-15 11:58:25.640560,Pressure_5,2026.8313489428847
            # ....
            | "Convert to list" >> beam.Map(lambda x: x.split(","))
            | "to tuple" >> beam.Map(lambda x: (roundTime(datetime.datetime.strptime(x[0], '%Y-%m-%d %H:%M:%S.%f'), roundTo=interval), [x[1], float(x[2])]))
            # After converting to tuple:
            # ('2021-02-15 11:58:23', ['Pressure_4', 2053.5058784228463])
            # ('2021-02-15 11:58:23', ['Pressure_5', 1554.4377708096179])
            # ('2021-02-15 11:58:23', ['Pressure_5', 1584.4305417337491])
            # ('2021-02-15 11:58:24', ['Pressure_1', 2392.3361259850535])
            # ('2021-02-15 11:58:24', ['Pressure_2', 1975.1719957768883])
            # ('2021-02-15 11:58:24', ['Pressure_3', 2713.306279034812])
            # ('2021-02-15 11:58:24', ['Pressure_4', 1902.3089540631288])
            # ('2021-02-15 11:58:24', ['Pressure_5', 1932.9791896923427])
            # ('2021-02-15 11:58:26', ['Pressure_1', 2022.983225929031])
            # ('2021-02-15 11:58:26', ['Pressure_2', 1799.7312618480091])
            # ('2021-02-15 11:58:26', ['Pressure_3', 2166.0263912901733])
            | "Window" >> beam.WindowInto(window.FixedWindows(15))
            | "Groupby" >> beam.GroupByKey()
            | "convert to csv" >> beam.Map(convertToCsv)
            # After aggregation and transpose (header shown for reference only, not included in the PCollection):
            # Timestamp , Pressure_1 , Pressure_2 , Pressure_3 , Pressure_4 , Pressure_5
            # 2021-02-15 12:09:35,1965.012352391227,2004.429036333092,2251.4681479138094,1771.5563402502842,1756.392145648632
            # 2021-02-15 12:09:36,1825.3075719663318,2259.6443056432713,2072.838378531564,2120.7269395022995,1783.3072412809652
            # 2021-02-15 12:09:37,2275.9356184022963,1989.6216653552221,2157.401713549938,1657.4214496485536,1923.4706666235156
            # 2021-02-15 12:09:38,1892.8002062318985,1945.9966592074343,2317.891325891871,1636.2570086301153,1971.247401688967
            # 2021-02-15 12:09:39,1944.2184082592953,1913.9882349839,2244.7614223797464,1893.985945158099,1579.50215334007
            | "Write to GCS" >> WriteToText(output_gcs_path)
            # This works with the direct runner, but doesn't work with the Dataflow runner
        )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--SUBSCRIPTION_NAME",
        help="The Cloud Pub/Sub subscription to read from.\n"
             '"projects/<PROJECT_ID>/subscriptions/<SUBSCRIPTION_NAME>".',
    )
    parser.add_argument(
        "--GCS_PATH",
        help="Path to the Cloud Storage bucket.\n"
             '"gs://<PROJECT_ID>/<BUCKET>/<PATH>"',
    )
    parser.add_argument(
        "--AGGREGATION_INTERVAL",
        type=int,
        default=1,
        help="Number of seconds to aggregate.\n",
    )
    args, pipeline_args = parser.parse_known_args()
    run(
        args.SUBSCRIPTION_NAME,
        args.GCS_PATH,
        args.AGGREGATION_INTERVAL,
        pipeline_args,
    )
I have the following code:
base_curr = 'BTC,XRP,ETH'
uri = f"https://min-api.cryptocompare.com/data/pricemultifull?fsyms={base_curr}&tsyms=JPY"
r = requests.get(uri)
rdd = sc.parallelize([r.text])
json_df = sqlContext.read.json(rdd)
json_df = json_df.select(
    F.col('RAW.BTC.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
    F.col('RAW.BTC.JPY.PRICE').alias('PRICE'),
    F.col('RAW.BTC.JPY.LASTUPDATE').alias('LASTUPDATE'),
    F.col('RAW.XRP.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
    F.col('RAW.XRP.JPY.PRICE').alias('PRICE'),
    F.col('RAW.XRP.JPY.LASTUPDATE').alias('LASTUPDATE'),
    F.col('RAW.ETH.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
    F.col('RAW.ETH.JPY.PRICE').alias('PRICE'),
    F.col('RAW.ETH.JPY.LASTUPDATE').alias('LASTUPDATE')
)
print(json_df)
json_df.show()
json_df.printSchema()
The output is:
+----------+----------+----------+----------+-----+----------+----------+--------+----------+
|FROMSYMBOL| PRICE|LASTUPDATE|FROMSYMBOL|PRICE|LASTUPDATE|FROMSYMBOL| PRICE|LASTUPDATE|
+----------+----------+----------+----------+-----+----------+----------+--------+----------+
| BTC|1994390.75|1607158504| XRP|61.85|1607158490| ETH|61874.24|1607158509|
+----------+----------+----------+----------+-----+----------+----------+--------+----------+
But the desired output is:
+----------+----------+----------+
|FROMSYMBOL| PRICE|LASTUPDATE|
+----------+----------+----------+
| BTC|1994390.75|1607158504|
+----------+----------+----------+
| XRP|61.85 |1607158490|
+----------+----------+----------+
| ETH|61874.24 |1607158509|
+----------+----------+----------+
Any help would be greatly appreciated. I tried several methods without success.
Does this work for you?
base_curr = 'BTC,XRP,ETH'
uri = f"https://min-api.cryptocompare.com/data/pricemultifull?fsyms={base_curr}&tsyms=JPY"
r = requests.get(uri)
rdd = sc.parallelize([r.text])
json_df = sqlContext.read.json(rdd)
json_df = json_df.select(
    F.explode(
        F.array(
            F.array(
                F.col('RAW.BTC.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
                F.col('RAW.BTC.JPY.PRICE').alias('PRICE'),
                F.col('RAW.BTC.JPY.LASTUPDATE').alias('LASTUPDATE')
            ),
            F.array(
                F.col('RAW.XRP.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
                F.col('RAW.XRP.JPY.PRICE').alias('PRICE'),
                F.col('RAW.XRP.JPY.LASTUPDATE').alias('LASTUPDATE')
            ),
            F.array(
                F.col('RAW.ETH.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
                F.col('RAW.ETH.JPY.PRICE').alias('PRICE'),
                F.col('RAW.ETH.JPY.LASTUPDATE').alias('LASTUPDATE')
            )
        )
    ).alias('arrays')
).select(
    F.col('arrays')[0].alias('FROMSYMBOL'),
    F.col('arrays')[1].alias('PRICE'),
    F.col('arrays')[2].alias('LASTUPDATE'),
)
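As a hedged follow-up on the design choice, not part of the original answer: packing mixed types into one array forces a common element type, so PRICE and LASTUPDATE lose their numeric types. An alternative sketch uses an array of structs instead, which preserves the column types; it assumes json_df is the DataFrame read from the JSON response (before the select above), and currency_struct is a hypothetical helper:

from pyspark.sql import functions as F

def currency_struct(sym):
    # One struct per currency, so PRICE and LASTUPDATE keep their numeric types.
    return F.struct(
        F.col(f'RAW.{sym}.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
        F.col(f'RAW.{sym}.JPY.PRICE').alias('PRICE'),
        F.col(f'RAW.{sym}.JPY.LASTUPDATE').alias('LASTUPDATE'),
    )

long_df = (
    json_df
    .select(F.explode(F.array(*[currency_struct(s) for s in ['BTC', 'XRP', 'ETH']])).alias('row'))
    .select('row.*')  # expand the struct into FROMSYMBOL, PRICE, LASTUPDATE columns
)
long_df.show()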
I have a list of data that looks something like this:
[
(
1,
u'python -c \'print("ok")\'',
u'data',
u'python'
), (
2,
u'python -c \'print("this is some data")\'',
u'data',
u'python'
)
]
This data is taken out of a database and displayed like this, and it is continuously growing. What I would like to do is display the data like this:
Language | Type | Payload
-------------------------------
python | data | python -c 'print("ok")'
python | data | python -c 'print("this is some data")'
I have a function that sort of does this, but not exactly as expected:
def print_table(data, cols, width):
    n, r = divmod(len(data), cols)
    pattern = "{{:{}}}".format(width)
    line = "\n".join(pattern * cols for _ in range(n))
    last_line = pattern * r
    print(line.format(*data))
    print(last_line.format(*data[n*cols:]))
How can I get the output to look as wanted? From the answers it is possible with pandas, but I would also like a way to do it without installing external modules.
Analyze the data for its maximum column widths and use string formatting; after some 'creative' formatting:
data = [
(
1,
u'python -c \'print("ok")\'',
u'data',
u'python'
), (
2,
u'python -c \'print("this is some data")\'',
u'data',
u'python'
)
]
def print_table(data):
    widths = {0: 0, 3: len("Language"), 2: len("Type"), 1: len("Payload")}
    for k in data:
        for i, d in enumerate(k):
            widths[i] = max(widths[i], len(str(d)))
    # print(widths)
    lan, typ, pay = ["Language", "Type", "Payload"]
    print(f"{lan:<{widths[3]}} | {typ:<{widths[2]}} | {pay:<{widths[1]}}")
    # adjust by 10 for ' | ' twice
    print("-" * (widths[1] + widths[2] + widths[3] + 10))
    for k in data:
        _, pay, typ, lan = k
        print(f"{lan:<{widths[3]}} | {typ:<{widths[2]}} | {pay:<{widths[1]}}")
Output:
Language | Type | Payload
------------------------------------------------------------
python | data | python -c 'print("ok")'
python | data | python -c 'print("this is some data")'
See the Python format specification mini-language for details on the formatting used.
Equivalent Python 2.7 code:
# w == widths - the longer name would break the 79 chars/line limit otherwise
def print_table(data):
    w = {0: 0, 3: len("Language"), 2: len("Type"), 1: len("Payload")}
    for k in data:
        for i, d in enumerate(k):
            w[i] = max(w[i], len(str(d)))
    lan, typ, pay = ["Language", "Type", "Payload"]
    print "{:<{}} | {:<{}} | {:<{}}".format(lan, w[3], typ, w[2], pay, w[1])
    print "-" * (w[1] + w[2] + w[3] + 10)
    for k in data:
        _, pay, typ, lan = k
        print "{:<{}} | {:<{}} | {:<{}}".format(lan, w[3], typ, w[2], pay, w[1])
Solution that handles any number of columns:
from operator import itemgetter

data = [
    ('ID', 'Payload', 'Type', 'Language'),
    (1, u'python -c \'print("ok")\'', u'data', u'python'),
    (2, u'python -c \'print("this is some data")\'', u'data', u'python')
]

def print_table(data):
    lengths = [
        [len(str(x)) for x in row]
        for row in data
    ]
    max_lengths = [
        max(map(itemgetter(x), lengths))
        for x in range(0, len(data[0]))
    ]
    format_str = ''.join(map(lambda x: '%%-%ss | ' % x, max_lengths))
    print(format_str % data[0])
    print('-' * (sum(max_lengths) + len(max_lengths) * 3 - 1))
    for x in data[1:]:
        print(format_str % x)

print_table(data)
Output:
$ python table.py
ID | Payload | Type | Language |
---------------------------------------------------------------
1 | python -c 'print("ok")' | data | python |
2 | python -c 'print("this is some data")' | data | python |
You can use pandas for that:
import pandas as pd

# `a` is the list of tuples shown in the question
data = pd.DataFrame(a, columns=['id', 'Payload', 'type', 'Language'])
print(data)
gives you:
id Payload type Language
0 1 python -c 'print("ok")' data python
1 2 python -c 'print("this is some data")' data python
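A small follow-up, not from the original answer: if the leading row index is unwanted in the printed table, pandas can drop it via DataFrame.to_string:

# Print the same DataFrame without the 0/1 row index on the left.
print(data.to_string(index=False))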