I'm getting the following error while running the DirectRunner:
can't pickle generator objects [while running 'Assign Side Columns']
The error pops up when Beam writes the "mapped rows" to BigQuery.
Any idea where this error is coming from?
def assign_columns(row, mapping):
    main_account = row['filepath_platform_id']
    main_report = row['filepath_report_name']
    for _mapping in mapping:
        _side_account = _mapping['account_id']
        _side_report = _mapping['report_name']
        if main_account == _side_account and main_report == _side_report:
            _cols = json.loads(_mapping['mapping'])
            row = renames_columns(row, _cols)
    yield row
def run(argv=None, save_main_session=True):
    options = PipelineOptions(flags=argv)
    options.view_as(SetupOptions).save_main_session = save_main_session
    table_spec = 'SELECT * FROM `nextwork-staging.datalake.v_mapping_reports`'
    with beam.Pipeline(options=options) as p:
        common_options = options.view_as(CommonOptions)
        file_metadata = (
            p
            | 'Create empty PCollection' >> beam.Create(['Start'])
            | 'Load Input Location' >> beam.ParDo(CreateFilesFromGlobByDate(common_options.input, None, None, None, common_options.starting_date, common_options.ending_date, common_options.only_last_file_from_input))
        )
        rows = (
            file_metadata
            | 'GetIncomeAccessFn' >> beam.ParDo(GetIncomeAccessFn())
            | 'Map to regular dictionary' >> beam.Map(lambda x: dict(x))
            | 'TransformDate' >> beam.ParDo(TransformDateFromStringFn())
        )
        side_input = (
            p
            | 'ReadTable' >> beam.io.ReadFromBigQuery(query=table_spec, use_standard_sql=True)
            | 'Map Columns' >> beam.Map(lambda x: dict(x))
        )
        mapped_rows = (
            rows
            | 'Assign Side Columns' >> beam.Map(assign_columns, mapping=beam.pvalue.AsIter(side_input))
            | 'Rename column' >> beam.Map(renames_columns, names={'site': 'campaign'})
            | 'TransformForBigQuery' >> beam.ParDo(TransformForBigQueryFn())
        )
        gcp_options = options.view_as(GoogleCloudOptions)
        (
            mapped_rows
            | 'WriteToBigQueryUsingParameter' >> bigquery_file_loads.BigQueryBatchFileLoads(
                destination=_get_table_name,
                schema=_get_schema,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_TRUNCATE',
                additional_bq_parameters=_get_additional_bq_parameters,
                custom_gcs_temp_location=gcp_options.temp_location
            )
        )
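A likely culprit, judging from the code above: assign_columns uses yield, which makes it a generator function, yet it is wired in with beam.Map, so every output element is a generator object rather than a row; that object then cannot be pickled when the BigQuery write stage serializes the data. A minimal sketch of the usual fix is to apply it with beam.FlatMap, which iterates over the yielded values:

# Sketch only: same step as above, but FlatMap unwraps the generator so the
# yielded rows themselves become the elements of mapped_rows.
mapped_rows = (
    rows
    | 'Assign Side Columns' >> beam.FlatMap(assign_columns, mapping=beam.pvalue.AsIter(side_input))
    | 'Rename column' >> beam.Map(renames_columns, names={'site': 'campaign'})
    | 'TransformForBigQuery' >> beam.ParDo(TransformForBigQueryFn())
)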
Related
class load_side_input(beam.DoFn):
    def process(self, pubsub_message):
        message = pubsub_message.decode("utf8")
        output: typing.Dict = {}
        for key in message.keys():
            output[key] = self.tag_model[key]
        return [output]

side_input = (
    p
    | "AMM Events" >> beam.io.ReadFromPubSub(subscription=opts.ammSub)
    | "Trigger event" >> beam.WindowInto(window.GlobalWindows(),
                                         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
                                         accumulation_mode=trigger.AccumulationMode.DISCARDING)
    | "Parse and Update Cache" >> beam.ParDo(load_side_input())
)

enrichment = (
    rows
    | 'Data Validation and Enrichment' >> beam.ParDo(validation(), y_side=AsDict(side_input))
)
File "/usr/local/lib/python3.9/site-packages/apache_beam/runners/worker/bundle_processor.py", line 434, in __getitem__
self._cache[target_window] = self._side_input_data.view_fn(raw_view)
ValueError: dictionary update sequence element #0 has length 101; 2 is required [while running 'Data Enrichment-ptransform-128']
You are feeding beam.pvalue.AsDict the wrong input format. From the documentation:
Parameters: pcoll – Input pcollection. All elements should be key-value pairs (i.e. 2-tuples) with unique keys.
Here is a minimal working example, which can be run in the Apache Beam Playground:
import apache_beam as beam

def use_side_input(main, side_input):
    return side_input[main]

class BadSideInputCreator(beam.DoFn):
    def process(self, element):
        output = {}
        output['1'] = 'value1'
        output['2'] = 'value2'
        yield [output]  # this is a list containing a dict, not a 2-tuple

class GoodSideInputCreator(beam.DoFn):
    def process(self, element):
        output = {}
        output['1'] = 'value1'
        output['2'] = 'value2'
        for key, value in output.items():
            yield (key, value)  # this is a 2-tuple

with beam.Pipeline() as pipeline:
    main = (
        pipeline
        | "init main" >> beam.Create(['1', '2'])
    )
    side = (
        pipeline
        | "init side" >> beam.Create(['dummy'])
        | beam.ParDo(BadSideInputCreator())  # replace with GoodSideInputCreator
    )
    (
        main
        | "use side input" >> beam.Map(use_side_input, side_input=beam.pvalue.AsDict(side))
        | "print" >> beam.Map(print)
    )
Running with BadSideInputCreator throws your error
ValueError: dictionary update sequence element #0 has length 1; 2 is required
while with GoodSideInputCreator we get the expected result
value1
value2
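Applied to the DoFn from the related question above, the same idea means emitting key-value 2-tuples instead of a list holding a dict. A rough sketch, assuming the Pub/Sub payload is JSON (hence the json.loads) and that tag_model is the lookup the original class already holds:

import json
import apache_beam as beam

class load_side_input(beam.DoFn):
    def process(self, pubsub_message):
        # Assumption: the Pub/Sub payload is a JSON object.
        message = json.loads(pubsub_message.decode("utf8"))
        for key in message.keys():
            # Yield 2-tuples so beam.pvalue.AsDict can build the side-input dict.
            yield (key, self.tag_model[key])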
I want to create a Hive table, and the source data file looks like this:
<__name__>abc
<__code__>1
<__value__>1234
<__name__>abcdef
<__code__>2
<__value__>12345
<__name__>abcdef
<__code__>2
<__value__>12345
1234156321
<__name__>abcdef
<__code__>2
<__value__>12345
...
Can I create a table directly, without converting the file first? It is a plain text file in which the same three fields repeat. If conversion is needed, how do I turn it into a DataFrame or a CSV file? I want either
| name | code | value
| abc | 1 | 1234
| abcdef | 2 | 12345
...
or
abc,1,1234
abcdef,2,12345
...
I solved my problem like this.
from pyspark.sql import Row
from pyspark.sql.types import StructType, StringType, IntegerType
from pyspark.sql.functions import regexp_replace, split, first

# path points at the raw text file
data = spark.read.text(path)

# Group every 3 consecutive lines into one record via the line index.
rows = data.rdd.zipWithIndex().map(lambda x: Row(x[0].value, int(x[1] / 3)))
schema = StructType() \
    .add("col1", StringType(), False) \
    .add("record_pos", IntegerType(), False)
df = spark.createDataFrame(rows, schema)

# Split each line into its tag (key) and its payload (value).
df1 = df.withColumn("key", regexp_replace(split(df["col1"], '__>')[0], '<|__', '')) \
    .withColumn("value", regexp_replace(regexp_replace(split(df["col1"], '__>')[1], '\n', '<NL>'), '\t', '<TAB>'))

# Pivot the key/value pairs back into one row per record.
dataframe = df1.groupBy("record_pos").pivot("key").agg(first("value")).drop("record_pos")
dataframe.show()
// Scala version of the same approach
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StringType, LongType}
import org.apache.spark.sql.functions.{regexp_replace, split, first}

val path = "file:///C:/stackqustions/data/stackq5.csv"
val data = sc.textFile(path)
import spark.implicits._

val rdd = data.zipWithIndex.map {
  case (records, index) => Row(records, index / 3)
}
val schema = new StructType().add("col1", StringType, false).add("record_pos", LongType, false)
val df = spark.createDataFrame(rdd, schema)
val df1 = df
  .withColumn("key", regexp_replace(split($"col1", ">")(0), "<|__", ""))
  .withColumn("value", split($"col1", ">")(1)).drop("col1")
df1.groupBy("record_pos").pivot("key").agg(first($"value")).drop("record_pos").show
result:
+----+------+-----+
|code| name|value|
+----+------+-----+
| 1| abc| 1234|
| 2|abcdef|12345|
| 2|abcdef|12345|
| 2|abcdef|12345|
+----+------+-----+
I have the following code:
base_curr = 'BTC,XRP,ETH'
uri = f"https://min-api.cryptocompare.com/data/pricemultifull?fsyms={base_curr}&tsyms=JPY"
r = requests.get(uri)
rdd = sc.parallelize([r.text])
json_df = sqlContext.read.json(rdd)
json_df = json_df.select(
F.col('RAW.BTC.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
F.col('RAW.BTC.JPY.PRICE').alias('PRICE'),
F.col('RAW.BTC.JPY.LASTUPDATE').alias('LASTUPDATE'),
F.col('RAW.XRP.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
F.col('RAW.XRP.JPY.PRICE').alias('PRICE'),
F.col('RAW.XRP.JPY.LASTUPDATE').alias('LASTUPDATE'),
F.col('RAW.ETH.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
F.col('RAW.ETH.JPY.PRICE').alias('PRICE'),
F.col('RAW.ETH.JPY.LASTUPDATE').alias('LASTUPDATE')
)
print(json_df)
json_df.show()
json_df.printSchema()
The output is:
+----------+----------+----------+----------+-----+----------+----------+--------+----------+
|FROMSYMBOL| PRICE|LASTUPDATE|FROMSYMBOL|PRICE|LASTUPDATE|FROMSYMBOL| PRICE|LASTUPDATE|
+----------+----------+----------+----------+-----+----------+----------+--------+----------+
| BTC|1994390.75|1607158504| XRP|61.85|1607158490| ETH|61874.24|1607158509|
+----------+----------+----------+----------+-----+----------+----------+--------+----------+
But the desired output is:
+----------+----------+----------+
|FROMSYMBOL| PRICE|LASTUPDATE|
+----------+----------+----------+
| BTC|1994390.75|1607158504|
+----------+----------+----------+
| XRP|61.85 |1607158490|
+----------+----------+----------+
| ETH|61874.24 |1607158509|
+----------+----------+----------+
Any help would be great. I tried several methods without success.
Does this work for you?
base_curr = 'BTC,XRP,ETH'
uri = f"https://min-api.cryptocompare.com/data/pricemultifull?fsyms={base_curr}&tsyms=JPY"
r = requests.get(uri)
rdd = sc.parallelize([r.text])
json_df = sqlContext.read.json(rdd)
json_df = json_df.select(
F.explode(
F.array(
F.array(F.col('RAW.BTC.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
F.col('RAW.BTC.JPY.PRICE').alias('PRICE'),
F.col('RAW.BTC.JPY.LASTUPDATE').alias('LASTUPDATE')
),
F.array(F.col('RAW.XRP.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
F.col('RAW.XRP.JPY.PRICE').alias('PRICE'),
F.col('RAW.XRP.JPY.LASTUPDATE').alias('LASTUPDATE')
),
F.array(F.col('RAW.ETH.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
F.col('RAW.ETH.JPY.PRICE').alias('PRICE'),
F.col('RAW.ETH.JPY.LASTUPDATE').alias('LASTUPDATE')
)
)
).alias('arrays')
).select(
F.col('arrays')[0].alias('FROMSYMBOL'),
F.col('arrays')[1].alias('PRICE'),
F.col('arrays')[2].alias('LASTUPDATE'),
)
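One note on the above: packing strings and numbers into a single array makes Spark coerce the elements to a common type, so the three result columns typically come out as strings. If keeping the original types matters, an alternative sketch is to build one small DataFrame per symbol and union them (here json_df_raw is an assumed name for the DataFrame returned by sqlContext.read.json(rdd), before any select):

# Alternative sketch: one row per symbol via unionByName, keeping column types.
from functools import reduce
import pyspark.sql.functions as F

symbols = ['BTC', 'XRP', 'ETH']
per_symbol = [
    json_df_raw.select(
        F.col(f'RAW.{s}.JPY.FROMSYMBOL').alias('FROMSYMBOL'),
        F.col(f'RAW.{s}.JPY.PRICE').alias('PRICE'),
        F.col(f'RAW.{s}.JPY.LASTUPDATE').alias('LASTUPDATE'),
    )
    for s in symbols
]
json_df = reduce(lambda a, b: a.unionByName(b), per_symbol)
json_df.show()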
I am trying to implement a CDC in Apache Beam, deployed in Google Cloud Dataflow.
I have unloaded the master data and the new data, which is expected to arrive daily.
The join is not working as expected. Something is missing.
master_data = (
    p
    | 'Read base from BigQuery ' >> beam.io.Read(beam.io.BigQuerySource(query=master_data, use_standard_sql=True))
    | 'Map id in master' >> beam.Map(lambda master: (master['id'], master)))

new_data = (
    p
    | 'Read Delta from BigQuery ' >> beam.io.Read(beam.io.BigQuerySource(query=new_data, use_standard_sql=True))
    | 'Map id in new' >> beam.Map(lambda new: (new['id'], new)))

joined_dicts = (
    {'master_data': master_data, 'new_data': new_data}
    | beam.CoGroupByKey()
    | beam.FlatMap(join_lists)
    | 'mergeddicts' >> beam.Map(lambda masterdict, newdict: newdict.update(masterdict))
)

def join_lists(k, v):
    itertools.product(v['master_data'], v['new_data'])
Observations (on sample data):
Data from the master
1, 'A',3232
2, 'B',234
New Data:
1,'A' ,44
4,'D',45
Expected result in master table, post the code implementation:
1, 'A',44
2, 'B',234
4,'D',45
However, what I am getting in master table is:
1,'A' ,44
4,'D',45
Am I missing a step? Can anyone please help me rectify my mistake?
You don't need a FlatMap after the CoGroupByKey, as that separates the grouped elements again.
Here is the sample code.
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam as beam

def join_lists(e):
    (k, v) = e
    # Keep the new value when it differs from the master value.
    return (k, v['new_data']) if v['new_data'] != v['master_data'] else (k, None)

# The with-block runs the pipeline and waits for it to finish on exit,
# so no explicit p.run() / wait_until_finish() is needed.
with beam.Pipeline(options=PipelineOptions()) as p:
    master_data = (
        p | 'Read base from BigQuery ' >> beam.Create([('A', [3232]), ('B', [234])])
    )
    new_data = (
        p | 'Read Delta from BigQuery ' >> beam.Create([('A', [44]), ('D', [45])])
    )
    joined_dicts = (
        {'master_data': master_data, 'new_data': new_data}
        | beam.CoGroupByKey()
        | 'mergeddicts' >> beam.Map(join_lists)
    )

print("Pipeline finished.")
I have a list of data that looks something like this:
[
    (
        1,
        u'python -c \'print("ok")\'',
        u'data',
        u'python'
    ), (
        2,
        u'python -c \'print("this is some data")\'',
        u'data',
        u'python'
    )
]
This data comes out of a database in this form, and it is continuously growing. What I would like to do is display it like this:
Language | Type | Payload
-------------------------------
python | data | python -c 'print("ok")'
python | data | python -c 'print("this is some data")'
I have a function that kinda does the same thing, but it's not exactly as expected:
def print_table(data, cols, width):
    n, r = divmod(len(data), cols)
    pattern = "{{:{}}}".format(width)
    line = "\n".join(pattern * cols for _ in range(n))
    last_line = pattern * r
    print(line.format(*data))
    print(last_line.format(*data[n*cols]))
How can I get the output to look the way I want? From the answers, it's possible with pandas, but I would also like a way to do it without installing external modules.
Analyze the data for its maximum column widths and use string formatting; after some 'creative' formatting:
data = [
    (
        1,
        u'python -c \'print("ok")\'',
        u'data',
        u'python'
    ), (
        2,
        u'python -c \'print("this is some data")\'',
        u'data',
        u'python'
    )
]
def print_table(data):
    widths = {0: 0, 3: len("Language"), 2: len("Type"), 1: len("Payload")}
    for k in data:
        for i, d in enumerate(k):
            widths[i] = max(widths[i], len(str(d)))
    # print(widths)
    lan, typ, pay = ["Language", "Type", "Payload"]
    print(f"{lan:<{widths[3]}} | {typ:<{widths[2]}} | {pay:<{widths[1]}}")
    # adjust by 10 for ' | ' twice
    print("-" * (widths[1] + widths[2] + widths[3] + 10))
    for k in data:
        _, pay, typ, lan = k
        print(f"{lan:<{widths[3]}} | {typ:<{widths[2]}} | {pay:<{widths[1]}}")
Output:
Language | Type | Payload
------------------------------------------------------------
python | data | python -c 'print("ok")'
python | data | python -c 'print("this is some data")'
See the Format Specification Mini-Language in the Python string formatting docs.
Equivalent Python 2.7 code:
# w == widths; the full name would break the 79 chars/line limit otherwise
def print_table(data):
    w = {0: 0, 3: len("Language"), 2: len("Type"), 1: len("Payload")}
    for k in data:
        for i, d in enumerate(k):
            w[i] = max(w[i], len(str(d)))
    lan, typ, pay = ["Language", "Type", "Payload"]
    print "{:<{}} | {:<{}} | {:<{}}".format(lan, w[3], typ, w[2], pay, w[1])
    print "-" * (w[1] + w[2] + w[3] + 10)
    for k in data:
        _, pay, typ, lan = k
        print "{:<{}} | {:<{}} | {:<{}}".format(lan, w[3], typ, w[2], pay, w[1])
Solution that handles any number of columns:
from operator import itemgetter

data = [
    ('ID', 'Payload', 'Type', 'Language'),
    (1, u'python -c \'print("ok")\'', u'data', u'python'),
    (2, u'python -c \'print("this is some data")\'', u'data', u'python')
]

def print_table(data):
    # Cell widths per row, then the maximum per column.
    lengths = [
        [len(str(x)) for x in row]
        for row in data
    ]
    max_lengths = [
        max(map(itemgetter(x), lengths))
        for x in range(0, len(data[0]))
    ]
    # Build an old-style format string such as '%-2s | %-39s | ... '.
    format_str = ''.join(map(lambda x: '%%-%ss | ' % x, max_lengths))
    print(format_str % data[0])
    print('-' * (sum(max_lengths) + len(max_lengths) * 3 - 1))
    for x in data[1:]:
        print(format_str % x)

print_table(data)
Output:
$ python table.py
ID | Payload | Type | Language |
---------------------------------------------------------------
1 | python -c 'print("ok")' | data | python |
2 | python -c 'print("this is some data")' | data | python |
You can use pandas for that:
import pandas as pd

# data is the list of tuples from the question
df = pd.DataFrame(data, columns=['id', 'Payload', 'type', 'Language'])
print(df)
gives you:
id Payload type Language
0 1 python -c 'print("ok")' data python
1 2 python -c 'print("this is some data")' data python
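If the integer index column on the left is unwanted, a small variant using the DataFrame built above is pandas' to_string:

# Suppress the default index when printing.
print(df.to_string(index=False))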