How to make a streaming pipeline from Pub/Sub to Datastore with Python?

I am trying to post a JSON file to Pub/Sub and write it to Datastore with Cloud Dataflow in a streaming process.
from __future__ import absolute_import
import apache_beam as beam
import json
import logging
import uuid  # required by make_entity (uuid.uuid4()); missing from the original
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from google.cloud.proto.datastore.v1 import entity_pb2
from apache_beam import window
from apache_beam.io.gcp.pubsub import ReadFromPubSub
from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore
from googledatastore import helper as datastore_helper

class EntityWrapper(object):
    def __init__(self, namespace, kind, ancestor):
        self._namespace = namespace
        self._kind = kind
        self._ancestor = ancestor

    def make_entity(self, content):
        entity = entity_pb2.Entity()
        if self._namespace is not None:
            entity.key.partition_id.namespace_id = self._namespace
        datastore_helper.add_key_path(entity.key, self._kind, self._ancestor,
                                      self._kind, str(uuid.uuid4()))
        datastore_helper.add_properties(entity, {"content": unicode(content)})
        return entity

pipeline_options = {
    'project': PROJECT,
    'staging_location': STAGING_LOCATION,
    'runner': 'DataflowRunner',
    'job_name': JOB_NAME,
    'temp_location': TEMP_LOCATION,
    'streaming': True}

options = PipelineOptions.from_dictionary(pipeline_options)

def run():
    p = beam.Pipeline(options=options)

    def parse_pubsub(line):
        record = json.loads(line)
        return record

    (p | "Read from PubSub" >> ReadFromPubSub(topic=TOPIC)
       | "PubSub message to Python object" >> beam.Map(parse_pubsub)
       | "Windowing" >> beam.WindowInto(window.FixedWindows(10))
       | "create entity" >> beam.Map(EntityWrapper(namespace=NAMESPACE, kind=KIND, ancestor=None).make_entity)
       | "write to DataStore" >> WriteToDatastore(PROJECT))

    result = p.run()
    result.wait_until_finish()

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
When I run this code on Google Cloud Shell, it runs and creates the pipeline.
But when I post JSON to Pub/Sub, it doesn't work.
The error message is below.
JOB_MESSAGE_ERROR: java.util.concurrent.ExecutionException: java.lang.RuntimeException: Error received from SDK harness for instruction -30: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 134, in _execute
response = task()
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 169, in <lambda>
self._execute(lambda: worker.do_instruction(work), work)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 215, in do_instruction
request.instruction_id)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 237, in process_bundle
processor.process_bundle(instruction_id)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/bundle_processor.py", line 299, in process_bundle
input_op.process_encoded(data.data)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/bundle_processor.py", line 120, in process_encoded
self.output(decoded_value)
File "apache_beam/runners/worker/operations.py", line 166, in apache_beam.runners.worker.operations.Operation.output
def output(self, windowed_value, output_index=0):
File "apache_beam/runners/worker/operations.py", line 167, in apache_beam.runners.worker.operations.Operation.output
cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 87, in apache_beam.runners.worker.operations.ConsumerSet.receive
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 387, in apache_beam.runners.worker.operations.DoOperation.process
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 388, in apache_beam.runners.worker.operations.DoOperation.process
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 589, in apache_beam.runners.common.DoFnRunner.receive
self.process(windowed_value)
File "apache_beam/runners/common.py", line 595, in apache_beam.runners.common.DoFnRunner.process
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 612, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise
File "apache_beam/runners/common.py", line 593, in apache_beam.runners.common.DoFnRunner.process
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 363, in apache_beam.runners.common.SimpleInvoker.invoke_process
output_processor.process_outputs(
File "apache_beam/runners/common.py", line 698, in apache_beam.runners.common._OutputProcessor.process_outputs
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 87, in apache_beam.runners.worker.operations.ConsumerSet.receive
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 387, in apache_beam.runners.worker.operations.DoOperation.process
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 388, in apache_beam.runners.worker.operations.DoOperation.process
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 589, in apache_beam.runners.common.DoFnRunner.receive
self.process(windowed_value)
File "apache_beam/runners/common.py", line 595, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 612, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise
File "apache_beam/runners/common.py", line 593, in apache_beam.runners.common.DoFnRunner.process
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 472, in apache_beam.runners.common.PerWindowInvoker.invoke_process
self._invoke_per_window(
File "apache_beam/runners/common.py", line 522, in apache_beam.runners.common.PerWindowInvoker._invoke_per_window
output_processor.process_outputs(
File "apache_beam/runners/common.py", line 659, in apache_beam.runners.common._OutputProcessor.process_outputs
def process_outputs(self, windowed_input_element, results):
File "apache_beam/runners/common.py", line 698, in apache_beam.runners.common._OutputProcessor.process_outputs
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 87, in apache_beam.runners.worker.operations.ConsumerSet.receive
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 387, in apache_beam.runners.worker.operations.DoOperation.process
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 388, in apache_beam.runners.worker.operations.DoOperation.process
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 589, in apache_beam.runners.common.DoFnRunner.receive
self.process(windowed_value)
File "apache_beam/runners/common.py", line 595, in apache_beam.runners.common.DoFnRunner.process
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 612, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise
File "apache_beam/runners/common.py", line 593, in apache_beam.runners.common.DoFnRunner.process
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 364, in apache_beam.runners.common.SimpleInvoker.invoke_process
windowed_value, self.process_method(windowed_value.value))
File "/home/shinya_yaginuma/.local/lib/python2.7/site-packages/apache_beam/transforms/core.py", line 1035, in <lambda>
File "pubsub_to_datastore.py", line 21, in make_entity
NameError: global name 'entity_pb2' is not defined
java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895)
org.apache.beam.sdk.util.MoreFutures.get(MoreFutures.java:57)
com.google.cloud.dataflow.worker.fn.control.RegisterAndProcessBundleOperation.finish(RegisterAndProcessBundleOperation.java:274)
com.google.cloud.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:83)
com.google.cloud.dataflow.worker.fn.control.BeamFnMapTaskExecutor.execute(BeamFnMapTaskExecutor.java:101)
com.google.cloud.dataflow.worker.StreamingDataflowWorker.process(StreamingDataflowWorker.java:1227)
com.google.cloud.dataflow.worker.StreamingDataflowWorker.access$1000(StreamingDataflowWorker.java:136)
com.google.cloud.dataflow.worker.StreamingDataflowWorker$6.run(StreamingDataflowWorker.java:966)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
java.lang.Thread.run(Thread.java:745)
I checked that all the libraries are installed, so I can't understand why the error occurs.
Regards,

So your imports are working fine, which is why the error occurs when you publish to Pub/Sub, not on Dataflow job creation. However, entity_pb2 is no longer defined on the worker by the time your make_entity actually gets called!
According to the docs, you need the import available on the worker where it actually gets used, or you can make your imports persistent. Try saving your main session:
pipeline_options = {
    'project': PROJECT,
    'staging_location': STAGING_LOCATION,
    'runner': 'DataflowRunner',
    'job_name': JOB_NAME,
    'temp_location': TEMP_LOCATION,
    'streaming': True,
    'save_main_session': True}
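Alternatively, keeping the imports next to the code that runs on the worker avoids the problem without touching the pipeline options. A minimal sketch, reusing the same EntityWrapper as in the question, just with the imports moved inside make_entity:

class EntityWrapper(object):
    def __init__(self, namespace, kind, ancestor):
        self._namespace = namespace
        self._kind = kind
        self._ancestor = ancestor

    def make_entity(self, content):
        # These imports execute on the worker, so the names are defined
        # there even when the main session is not pickled.
        import uuid
        from google.cloud.proto.datastore.v1 import entity_pb2
        from googledatastore import helper as datastore_helper

        entity = entity_pb2.Entity()
        if self._namespace is not None:
            entity.key.partition_id.namespace_id = self._namespace
        datastore_helper.add_key_path(entity.key, self._kind, self._ancestor,
                                      self._kind, str(uuid.uuid4()))
        datastore_helper.add_properties(entity, {"content": unicode(content)})
        return entity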

Related

Apache Beam infer schema using NamedTuple (Python)

I am quite new to Apache Beam and I am wondering how to infer a schema for a PCollection using a NamedTuple.
The example from the Programming Guide in the documentation states:
class Transaction(typing.NamedTuple):
    bank: str
    purchase_amount: float

pc = input | beam.Map(lambda ...).with_output_types(Transaction)
I tried to implement a similar thing, but reading from a Parquet file first:
from apache_beam import coders
from typing import NamedTuple
import apache_beam as beam

class TestSchema(NamedTuple):
    company_id: int
    is_company: bool
    company_created_datetime: str
    company_values: str

if __name__ == '__main__':
    coders.registry.register_coder(TestSchema, coders.RowCoder)
    with beam.Pipeline() as pipeline:
        record = pipeline | "Read Parquet" >> beam.io.ReadFromParquet("test.parquet").with_output_types(TestSchema) \
                          | "Print" >> beam.Map(print)
        pipeline.run().wait_until_finish()
And I am getting AttributeError: 'dict' object has no attribute 'company_id' [while running 'Read Parquet/ParDo(_ArrowTableToRowDictionaries)'].
Also, without the .with_output_types(TestSchema) I can see the data fine, and it looks like this:
{'company_id': 3, 'is_company': True, 'company_created_datetime': datetime.datetime(2022, 3, 8, 13, 2, 26, 573511), 'company_values': 'test value'}
I am using Python 3.8 and Beam 2.37.0.
Am I missing something? Any help would be appreciated (stack trace below).
Traceback (most recent call last):
File "apache_beam/runners/worker/operations.py", line 346, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 348, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 817, in apache_beam.runners.worker.operations.SdfProcessSizedElements.process
File "apache_beam/runners/worker/operations.py", line 826, in apache_beam.runners.worker.operations.SdfProcessSizedElements.process
File "apache_beam/runners/common.py", line 1206, in apache_beam.runners.common.DoFnRunner.process_with_sized_restriction
File "apache_beam/runners/common.py", line 698, in apache_beam.runners.common.PerWindowInvoker.invoke_process
File "apache_beam/runners/common.py", line 836, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "apache_beam/runners/common.py", line 1361, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 707, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 708, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 1200, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1281, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 1198, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 718, in apache_beam.runners.common.PerWindowInvoker.invoke_process
File "apache_beam/runners/common.py", line 841, in apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "apache_beam/runners/common.py", line 1361, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 214, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 178, in apache_beam.runners.worker.operations.ConsumerSet.update_counters_start
File "apache_beam/runners/worker/opcounters.py", line 211, in apache_beam.runners.worker.opcounters.OperationCounters.update_from
File "apache_beam/runners/worker/opcounters.py", line 250, in apache_beam.runners.worker.opcounters.OperationCounters.do_sample
File "apache_beam/coders/coder_impl.py", line 1425, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
File "apache_beam/coders/coder_impl.py", line 1436, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
File "apache_beam/coders/coder_impl.py", line 207, in apache_beam.coders.coder_impl.CoderImpl.get_estimated_size_and_observables
File "apache_beam/coders/coder_impl.py", line 246, in apache_beam.coders.coder_impl.StreamCoderImpl.estimate_size
File "apache_beam/coders/coder_impl.py", line 1610, in apache_beam.coders.coder_impl.RowCoderImpl.encode_to_stream
AttributeError: 'dict' object has no attribute 'company_id' [while running 'Read Parquet/ParDo(_ArrowTableToRowDictionaries)']
OK, after some research on Beam schemas and digging in the source code, I finally found the solution. It looks like you need to convert every single value in the PCollection to the NamedTuple and then apply the type hint.
with beam.Pipeline() as pipeline:
    record = pipeline | "Read Parquet" >> beam.io.ReadFromParquet("test.parquet") \
                      | "Transform to NamedTuple" >> beam.Map(lambda x: TestSchema(**x)).with_output_types(TestSchema) \
                      | "Print" >> beam.Map(print)

Python Bigtable Dataflow - Can't pickle <class 'Mutation'>

I'm writing a Dataflow pipeline using Apache Beam to add large batches of rows of data to Bigtable.
apache-beam==2.24.0
google-cloud-bigtable==2.4.0
I have the following method used in my pipeline to create the Bigtable row(s) prior to writing to Bigtable:
class CreateBigtableRow(beam.DoFn):
    def __init__(self, settings):
        self.column_family = settings["bigtable_column_family"]
        super(CreateBigtableRow, self).__init__()

    def process(self, usage_data, *args, **kwargs):
        row_key = BigTable.generate_row_key(usage_data, key_order)
        return [
            BigTable.create_row_and_assign_values(
                row_key, usage_data, self.column_family
            )
        ]
where `create_row_and_assign_values` is defined as:
def create_row_and_assign_values(
    cls, key: str, row: dict, column_family: str
) -> DirectRow:
    table_row = DirectRow(key.encode())
    for key, val in row.items():
        if isinstance(val, float):
            val = struct.pack(">d", val)
        table_row.set_cell(column_family, key.encode(), val)
    return table_row
My pipeline is as follows:
with beam.Pipeline(options=pipeline_options) as pipe:
    (
        pipe
        | beam.Create(["/sample_files/*combined*"])  # reads sample csv file
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.FlatMap(
            lambda file: csv.DictReader(open(file.metadata.path))
        )
        | "Transform to Usage dict" >> beam.ParDo(TransformToBigtableData())
        | "Create Bigtable Row" >> beam.ParDo(CreateBigtableRow(bigtable_settings))
        | WriteToBigTable(
            project_id=bigtable_settings["bigtable_project"],
            instance_id=bigtable_settings["bigtable_instance"],
            table_id=bigtable_settings["bigtable_table"])
    )
The problem I'm having is that I get the error
_pickle.PicklingError: Can't pickle <class 'Mutation'>: attribute lookup Mutation on __main__ failed [while running 'Create Bigtable Row']
when running the pipeline. I've added steps to manually batch-process the records using the google-cloud-bigtable library's Bigtable client, but I would prefer to use the built-in WriteToBigTable transform, as it handles everything for me.
Full stack trace:
Traceback (most recent call last):
File "/app/src/ingest/main.py", line 226, in <module>
run(
File "/app/src/ingest/main.py", line 149, in run
(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/pipeline.py", line 596, in __exit__
self.result = self.run()
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/pipeline.py", line 546, in run
return Pipeline.from_runner_api(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/pipeline.py", line 573, in run
return self.runner.run_pipeline(self, self._options)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/direct/direct_runner.py", line 131, in run_pipeline
return runner.run_pipeline(pipeline, options)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 195, in run_pipeline
self._latest_run_result = self.run_via_runner_api(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 206, in run_via_runner_api
return self.run_stages(stage_context, stages)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 384, in run_stages
stage_results = self._run_stage(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 646, in _run_stage
self._run_bundle(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 769, in _run_bundle
result, splits = bundle_manager.process_bundle(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 1080, in process_bundle
result_future = self._worker_handler.control_conn.push(process_bundle_req)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/worker_handlers.py", line 378, in push
response = self.worker.do_instruction(request)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/sdk_worker.py", line 597, in do_instruction
return getattr(self, request_type)(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/sdk_worker.py", line 635, in process_bundle
bundle_processor.process_bundle(instruction_id))
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/bundle_processor.py", line 995, in process_bundle
input_op_by_transform_id[element.transform_id].process_encoded(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/bundle_processor.py", line 221, in process_encoded
self.output(decoded_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 354, in output
cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 216, in receive
self.consumer.process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 714, in process
delayed_applications = self.dofn_runner.process(o)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1235, in process
self._reraise_augmented(exn)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1233, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 571, in invoke_process
self.output_processor.process_outputs(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1396, in process_outputs
self.main_receivers.receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 216, in receive
self.consumer.process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 714, in process
delayed_applications = self.dofn_runner.process(o)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1235, in process
self._reraise_augmented(exn)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1233, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 571, in invoke_process
self.output_processor.process_outputs(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1396, in process_outputs
self.main_receivers.receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 216, in receive
self.consumer.process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 714, in process
delayed_applications = self.dofn_runner.process(o)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1235, in process
self._reraise_augmented(exn)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1233, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 571, in invoke_process
self.output_processor.process_outputs(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1396, in process_outputs
self.main_receivers.receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 216, in receive
self.consumer.process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 714, in process
delayed_applications = self.dofn_runner.process(o)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1235, in process
self._reraise_augmented(exn)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1233, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 571, in invoke_process
self.output_processor.process_outputs(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1396, in process_outputs
self.main_receivers.receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 216, in receive
self.consumer.process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 714, in process
delayed_applications = self.dofn_runner.process(o)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1235, in process
self._reraise_augmented(exn)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1233, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 571, in invoke_process
self.output_processor.process_outputs(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1396, in process_outputs
self.main_receivers.receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 216, in receive
self.consumer.process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 714, in process
delayed_applications = self.dofn_runner.process(o)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1235, in process
self._reraise_augmented(exn)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1233, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 571, in invoke_process
self.output_processor.process_outputs(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1396, in process_outputs
self.main_receivers.receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 216, in receive
self.consumer.process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 714, in process
delayed_applications = self.dofn_runner.process(o)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1235, in process
self._reraise_augmented(exn)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1316, in _reraise_augmented
raise new_exn.with_traceback(tb)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1233, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 571, in invoke_process
self.output_processor.process_outputs(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1396, in process_outputs
self.main_receivers.receive(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 215, in receive
self.update_counters_start(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 179, in update_counters_start
self.opcounter.update_from(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/opcounters.py", line 211, in update_from
self.do_sample(windowed_value)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/runners/worker/opcounters.py", line 250, in do_sample
self.coder_impl.get_estimated_size_and_observables(windowed_value))
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/coders/coder_impl.py", line 1371, in get_estimated_size_and_observables
self._value_coder.get_estimated_size_and_observables(
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/coders/coder_impl.py", line 358, in get_estimated_size_and_observables
self.encode_to_stream(value, out, nested)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/coders/coder_impl.py", line 422, in encode_to_stream
self.fallback_coder_impl.encode_to_stream(value, stream, nested)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/coders/coder_impl.py", line 262, in encode_to_stream
return stream.write(self._encoder(value), nested)
File "/opt/pysetup/.venv/lib/python3.9/site-packages/apache_beam/coders/coders.py", line 800, in <lambda>
lambda x: dumps(x, protocol), pickle.loads)
_pickle.PicklingError: Can't pickle <class 'Mutation'>: attribute lookup Mutation on __main__ failed [while running 'Create Bigtable Row']
Your google-cloud-bigtable version is too high.
There is some movement on updating the apache-beam dependencies here.
They have the same issue. Can you roll back your Bigtable version to something before 2.0? If you run this:
pip install apache-beam[gcp]
it'll install the recommended version.
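For example (the exact pins here are illustrative, not tested guidance), you could combine the Beam install with an explicit ceiling on the Bigtable client, matching the versions from the question:

pip install "apache-beam[gcp]==2.24.0" "google-cloud-bigtable<2.0.0"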

in process NameError: name 'monitoring_v3' is not defined [while running 'PubSub Unread Messages-ptransform-32']

I am not able to run the streaming Dataflow job that gets the count of unread messages.
I need a streaming Dataflow pipeline to monitor the unread messages.
Is there any issue in my code?
import logging
import datetime
import argparse
import yaml
import apache_beam as beam
from apache_beam import Pipeline, ParDo, DoFn
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import monitoring_v3
from google.cloud.monitoring_v3 import query

class unread(DoFn):
    def __init__(self, project, pubsub_subscription):
        self.project = project
        self.pubsub_subscription = pubsub_subscription

    def process(self, key_value):
        logging.info("Key_value {}".format(key_value))
        client = monitoring_v3.MetricServiceClient()
        result = query.Query(
            client,
            project,
            'pubsub.googleapis.com/subscription/num_undelivered_messages',
            end_time=datetime.datetime.now(),
            minutes=2,
        ).as_dataframe()
        if result.empty:
            logging.info("All Messages Read")
        else:
            logging.info(result['pubsub_subscription'][self.project][self.pubsub_subscription][0])
        return result

def run():
    pipeline_args = [
        # arguments passed
    ]
    logging.info("Pipeline Arguments: {}".format(pipeline_args))
    try:
        # Set `save_main_session` to True so DoFns can access globally imported modules.
        pipeline_options = PipelineOptions(
            pipeline_args, streaming=True, save_main_session=True
        )
        with Pipeline(options=pipeline_options) as pipeline:
            (
                pipeline
                | 'Initializing ' >> beam.Create(['1'])
                | 'PubSub Unread Messages' >> ParDo(unread(project, pubsub_subscription))
            )
    except Exception as e:
        logging.error("Failed to Deploy DataFlow Pipeline. Error {}".format(e))

if __name__ == "__main__":
    with open("input.yaml", 'r') as yamlfile:
        cfg = yaml.load(yamlfile, Loader=yaml.BaseLoader)
    logging.basicConfig(format='%(asctime)s %(levelname)s - %(message)s', datefmt='%y-%m-%d %H:%M:%S', level=logging.INFO)
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args()
    run()
Error message:
Error message from worker: generic::unknown: Traceback (most recent call last):
  File "apache_beam/runners/common.py", line 1233, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 582, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "unread.py", line 18, in process
NameError: name 'monitoring_v3' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 289, in _execute
    response = task()
  File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 362, in <lambda>
    lambda: self.create_worker().do_instruction(request), request)
  File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 607, in do_instruction
    getattr(request, request_type), request.instruction_id)
  File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 644, in process_bundle
    bundle_processor.process_bundle(instruction_id))
  File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 1001, in process_bundle
    element.data)
  File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 229, in process_encoded
    self.output(decoded_value)
  File "apache_beam/runners/worker/operations.py", line 356, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 358, in apache_beam.runners.worker.operations.Operation.output
  File "apache_beam/runners/worker/operations.py", line 220, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 717, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 718, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1235, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1300, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1233, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 581, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1395, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "apache_beam/runners/worker/operations.py", line 220, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 717, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 718, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1235, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1300, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1233, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 581, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1395, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "apache_beam/runners/worker/operations.py", line 220, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 717, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 718, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1235, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1315, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 446, in raise_with_traceback
    raise exc.with_traceback(traceback)
  File "apache_beam/runners/common.py", line 1233, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 582, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "unread.py", line 18, in process
NameError: name 'monitoring_v3' is not defined [while running 'PubSub Unread Messages-ptransform-32']
See how to handle the NameError here.
Try importing the module inside the function, or enable the --save_main_session pipeline option.
The default Python dependencies that are installed on the workers are listed here:
https://cloud.google.com/dataflow/docs/concepts/sdk-worker-dependencies#python-3.7.10
To install additional packages, please follow the steps mentioned here: https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/
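As a sketch of the first suggestion, moving the imports into process keeps them available on the worker regardless of how the DoFn is serialized. This reuses the code from the question (note that the original also referenced a bare project instead of self.project):

class unread(DoFn):
    def __init__(self, project, pubsub_subscription):
        self.project = project
        self.pubsub_subscription = pubsub_subscription

    def process(self, key_value):
        # Imports placed here run on the worker, so the names are always
        # defined even when the main session is not saved.
        import datetime
        import logging
        from google.cloud import monitoring_v3
        from google.cloud.monitoring_v3 import query

        client = monitoring_v3.MetricServiceClient()
        result = query.Query(
            client,
            self.project,  # the original used a bare `project` here
            'pubsub.googleapis.com/subscription/num_undelivered_messages',
            end_time=datetime.datetime.now(),
            minutes=2,
        ).as_dataframe()
        if result.empty:
            logging.info("All Messages Read")
        yield result  # process() must produce an iterable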

When I run 'pip install scipy' in my PyCharm terminal I get an AttributeError

I am trying to install the scipy package in one of my PyCharm projects. I get the following AttributeError:
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'text_temp'
Following is the exception that I am getting:
Collecting scipy
Exception:
Traceback (most recent call last):
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/cli/base_command.py", line 179, in main
status = self.run(options, args)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/commands/install.py", line 315, in run
resolver.resolve(requirement_set)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/resolve.py", line 131, in resolve
self._resolve_one(requirement_set, req)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/resolve.py", line 294, in _resolve_one
abstract_dist = self._get_abstract_dist_for(req_to_install)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/resolve.py", line 242, in _get_abstract_dist_for
self.require_hashes
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/operations/prepare.py", line 269, in prepare_linked_requirement
req.populate_link(finder, upgrade_allowed, require_hashes)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/req/req_install.py", line 196, in populate_link
self.link = finder.find_requirement(self, upgrade)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/index.py", line 639, in find_requirement
all_candidates = self.find_all_candidates(req.name)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/index.py", line 614, in find_all_candidates
self._package_versions(page.iter_links(), search)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/index.py", line 775, in _package_versions
for link in self._sort_links(links):
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/index.py", line 759, in _sort_links
for link in links:
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_internal/index.py", line 971, in iter_links
namespaceHTMLElements=False,
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py", line 47, in parse
return p.parse(doc, **kwargs)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py", line 289, in parse
self._parse(stream, False, None, *args, **kwargs)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py", line 134, in _parse
self.mainLoop()
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py", line 239, in mainLoop
new_token = phase.processSpaceCharacters(new_token)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/html5parser.py", line 469, in processSpaceCharacters
self.tree.insertText(token["data"])
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/treebuilders/base.py", line 357, in insertText
parent.insertText(data)
File "/Users/rohitgite/PycharmProjects/HelloWorld/venv/lib/python3.7/site-packages/pip-19.0.3-py3.7.egg/pip/_vendor/html5lib/treebuilders/etree.py", line 109, in insertText
if not self._element.text_temp:
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'text_temp'

How to create a Dataflow pipeline from Pub/Sub to GCS in Python

I want to use Dataflow to move data from Pub/Sub to GCS.
So basically I want Dataflow to accumulate some messages for a fixed amount of time (15 minutes, for example), then write that data as a text file to GCS when that amount of time has passed.
My final goal is to create a custom pipeline, so the "Pub/Sub to Cloud Storage" template is not enough for me, and I don't know Java at all, which made me start tweaking in Python.
Here is what I have got as of now (Apache Beam Python SDK 2.10.0):
import apache_beam as beam

TOPIC_PATH = "projects/<my-project>/topics/<my-topic>"

def CombineFn(e):
    return "\n".join(e)

o = beam.options.pipeline_options.PipelineOptions()
p = beam.Pipeline(options=o)

data = (p | "Read From Pub/Sub" >> beam.io.ReadFromPubSub(topic=TOPIC_PATH)
          | "Window" >> beam.WindowInto(beam.window.FixedWindows(30))
          | "Combine" >> beam.transforms.core.CombineGlobally(CombineFn).without_defaults()
          | "Output" >> beam.io.WriteToText("<GCS path or local path>"))

res = p.run()
res.wait_until_finish()
I ran this program without problems in a local environment:
python main.py
It runs locally, reading from the specified Pub/Sub topic and writing to the specified GCS path every time 30 seconds have passed, as expected.
The problem, however, is that when I run this on Google Cloud Platform, namely Cloud Dataflow, it continuously emits a mysterious exception:
java.util.concurrent.ExecutionException: java.lang.RuntimeException: Error received from SDK harness for instruction -1096: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 148, in _execute
response = task()
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 183, in <lambda>
self._execute(lambda: worker.do_instruction(work), work)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 256, in do_instruction
request.instruction_id)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 272, in process_bundle
bundle_processor.process_bundle(instruction_id)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/bundle_processor.py", line 494, in process_bundle
op.finish()
File "apache_beam/runners/worker/operations.py", line 506, in apache_beam.runners.worker.operations.DoOperation.finish
def finish(self):
File "apache_beam/runners/worker/operations.py", line 507, in apache_beam.runners.worker.operations.DoOperation.finish
with self.scoped_finish_state:
File "apache_beam/runners/worker/operations.py", line 508, in apache_beam.runners.worker.operations.DoOperation.finish
self.dofn_runner.finish()
File "apache_beam/runners/common.py", line 703, in apache_beam.runners.common.DoFnRunner.finish
self._invoke_bundle_method(self.do_fn_invoker.invoke_finish_bundle)
File "apache_beam/runners/common.py", line 697, in apache_beam.runners.common.DoFnRunner._invoke_bundle_method
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 722, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise_with_traceback(new_exn)
File "apache_beam/runners/common.py", line 695, in apache_beam.runners.common.DoFnRunner._invoke_bundle_method
bundle_method()
File "apache_beam/runners/common.py", line 361, in apache_beam.runners.common.DoFnInvoker.invoke_finish_bundle
def invoke_finish_bundle(self):
File "apache_beam/runners/common.py", line 364, in apache_beam.runners.common.DoFnInvoker.invoke_finish_bundle
self.output_processor.finish_bundle_outputs(
File "apache_beam/runners/common.py", line 832, in apache_beam.runners.common._OutputProcessor.finish_bundle_outputs
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 87, in apache_beam.runners.worker.operations.ConsumerSet.receive
self.update_counters_start(windowed_value)
File "apache_beam/runners/worker/operations.py", line 93, in apache_beam.runners.worker.operations.ConsumerSet.update_counters_start
self.opcounter.update_from(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 195, in apache_beam.runners.worker.opcounters.OperationCounters.update_from
self.do_sample(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 213, in apache_beam.runners.worker.opcounters.OperationCounters.do_sample
self.coder_impl.get_estimated_size_and_observables(windowed_value))
File "apache_beam/coders/coder_impl.py", line 953, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
def get_estimated_size_and_observables(self, value, nested=False):
File "apache_beam/coders/coder_impl.py", line 969, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
self._windows_coder.estimate_size(value.windows, nested=True))
File "apache_beam/coders/coder_impl.py", line 758, in apache_beam.coders.coder_impl.SequenceCoderImpl.estimate_size
self.get_estimated_size_and_observables(value))
File "apache_beam/coders/coder_impl.py", line 772, in apache_beam.coders.coder_impl.SequenceCoderImpl.get_estimated_size_and_observables
self._elem_coder.get_estimated_size_and_observables(
File "apache_beam/coders/coder_impl.py", line 134, in apache_beam.coders.coder_impl.CoderImpl.get_estimated_size_and_observables
return self.estimate_size(value, nested), []
File "apache_beam/coders/coder_impl.py", line 458, in apache_beam.coders.coder_impl.IntervalWindowCoderImpl.estimate_size
typed_value = value
TypeError: Cannot convert GlobalWindow to apache_beam.utils.windowed_value._IntervalWindowBase [while running 'generatedPtransform-1090']
java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895)
org.apache.beam.sdk.util.MoreFutures.get(MoreFutures.java:57)
org.apache.beam.runners.dataflow.worker.fn.control.RegisterAndProcessBundleOperation.finish(RegisterAndProcessBundleOperation.java:280)
org.apache.beam.runners.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:84)
org.apache.beam.runners.dataflow.worker.fn.control.BeamFnMapTaskExecutor.execute(BeamFnMapTaskExecutor.java:130)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.process(StreamingDataflowWorker.java:1233)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.access$1000(StreamingDataflowWorker.java:144)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker$6.run(StreamingDataflowWorker.java:972)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.RuntimeException: Error received from SDK harness for instruction -1096: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 148, in _execute
response = task()
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 183, in <lambda>
self._execute(lambda: worker.do_instruction(work), work)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 256, in do_instruction
request.instruction_id)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/sdk_worker.py", line 272, in process_bundle
bundle_processor.process_bundle(instruction_id)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/worker/bundle_processor.py", line 494, in process_bundle
op.finish()
File "apache_beam/runners/worker/operations.py", line 506, in apache_beam.runners.worker.operations.DoOperation.finish
def finish(self):
File "apache_beam/runners/worker/operations.py", line 507, in apache_beam.runners.worker.operations.DoOperation.finish
with self.scoped_finish_state:
File "apache_beam/runners/worker/operations.py", line 508, in apache_beam.runners.worker.operations.DoOperation.finish
self.dofn_runner.finish()
File "apache_beam/runners/common.py", line 703, in apache_beam.runners.common.DoFnRunner.finish
self._invoke_bundle_method(self.do_fn_invoker.invoke_finish_bundle)
File "apache_beam/runners/common.py", line 697, in apache_beam.runners.common.DoFnRunner._invoke_bundle_method
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 722, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise_with_traceback(new_exn)
File "apache_beam/runners/common.py", line 695, in apache_beam.runners.common.DoFnRunner._invoke_bundle_method
bundle_method()
File "apache_beam/runners/common.py", line 361, in apache_beam.runners.common.DoFnInvoker.invoke_finish_bundle
def invoke_finish_bundle(self):
File "apache_beam/runners/common.py", line 364, in apache_beam.runners.common.DoFnInvoker.invoke_finish_bundle
self.output_processor.finish_bundle_outputs(
File "apache_beam/runners/common.py", line 832, in apache_beam.runners.common._OutputProcessor.finish_bundle_outputs
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 87, in apache_beam.runners.worker.operations.ConsumerSet.receive
self.update_counters_start(windowed_value)
File "apache_beam/runners/worker/operations.py", line 93, in apache_beam.runners.worker.operations.ConsumerSet.update_counters_start
self.opcounter.update_from(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 195, in apache_beam.runners.worker.opcounters.OperationCounters.update_from
self.do_sample(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 213, in apache_beam.runners.worker.opcounters.OperationCounters.do_sample
self.coder_impl.get_estimated_size_and_observables(windowed_value))
File "apache_beam/coders/coder_impl.py", line 953, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
def get_estimated_size_and_observables(self, value, nested=False):
File "apache_beam/coders/coder_impl.py", line 969, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
self._windows_coder.estimate_size(value.windows, nested=True))
File "apache_beam/coders/coder_impl.py", line 758, in apache_beam.coders.coder_impl.SequenceCoderImpl.estimate_size
self.get_estimated_size_and_observables(value))
File "apache_beam/coders/coder_impl.py", line 772, in apache_beam.coders.coder_impl.SequenceCoderImpl.get_estimated_size_and_observables
self._elem_coder.get_estimated_size_and_observables(
File "apache_beam/coders/coder_impl.py", line 134, in apache_beam.coders.coder_impl.CoderImpl.get_estimated_size_and_observables
return self.estimate_size(value, nested), []
File "apache_beam/coders/coder_impl.py", line 458, in apache_beam.coders.coder_impl.IntervalWindowCoderImpl.estimate_size
typed_value = value
TypeError: Cannot convert GlobalWindow to apache_beam.utils.windowed_value._IntervalWindowBase [while running 'generatedPtransform-1090']
org.apache.beam.runners.fnexecution.control.FnApiControlClient$ResponseStreamObserver.onNext(FnApiControlClient.java:157)
org.apache.beam.runners.fnexecution.control.FnApiControlClient$ResponseStreamObserver.onNext(FnApiControlClient.java:140)
org.apache.beam.vendor.grpc.v1p13p1.io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:248)
org.apache.beam.vendor.grpc.v1p13p1.io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
org.apache.beam.vendor.grpc.v1p13p1.io.grpc.Contexts$ContextualizedServerCallListener.onMessage(Contexts.java:76)
org.apache.beam.vendor.grpc.v1p13p1.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:263)
org.apache.beam.vendor.grpc.v1p13p1.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:683)
org.apache.beam.vendor.grpc.v1p13p1.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
org.apache.beam.vendor.grpc.v1p13p1.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:123)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
java.lang.Thread.run(Thread.java:745)
Every time it tries to write to GCS, the exception above is shown, in a non-blocking way. This leads to a situation where, when it attempts to output, a new text file is certainly generated, but the text content is always the same as the first windowed output. This is obviously unwanted.
The exception is so deeply nested in the stack trace that it is extremely hard to guess what the root cause is, and I have no idea why it runs fine on DirectRunner but not at all on DataflowRunner.
It seems to say that somewhere in the pipeline, global windowed values are converted to non-global windowed values, although I used a non-global window transform at the second stage of the pipeline. Adding custom triggers didn't help.
I ran into this same error and found a workaround, but not a fix:
TypeError: Cannot convert GlobalWindow to apache_beam.utils.windowed_value._IntervalWindowBase [while running 'test-file-out/Write/WriteImpl/WriteBundles']
It occurred both locally with DirectRunner and on Dataflow with DataflowRunner.
Reverting to apache-beam[gcp]==2.9.0 allows my pipeline to run as expected.
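If it helps, the rollback is just a pinned install (the version comes from the workaround above):
pip install "apache-beam[gcp]==2.9.0"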
I have had so much trouble trying to figure out the
TypeError: Cannot convert GlobalWindow to apache_beam.utils.windowed_value._IntervalWindowBase [while running 'generatedPtransform-1090']
There seems to be something wrong with WriteToText after Beam 2.9.0 (I am using Beam 2.14.0, Python 3.7):
| "Output" >> beam.io.WriteToText("<GCS path or local path>"))
What made it work for me was removing that part of the pipeline and adding a custom DoFn:
class WriteToGCS(beam.DoFn):
    def __init__(self):
        self.outdir = "gs://<project>/<folder>/<file>"

    def process(self, element):
        from apache_beam.io.filesystems import FileSystems  # needed here
        import json

        writer = FileSystems.create(self.outdir + '.csv', 'text/plain')
        writer.write(element)
        writer.close()

and in the pipeline add:

| 'Save file' >> beam.ParDo(WriteToGCS())
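One caveat with this workaround: FileSystems.create always opens the same object name, so each window overwrites the last. A hedged variation that stamps the window start into the file name (using beam.DoFn.WindowParam; the path pattern is made up for illustration, and elements are assumed to be strings) keeps one file per window:

class WriteWindowedToGCS(beam.DoFn):
    def __init__(self, outdir):
        self.outdir = outdir  # e.g. "gs://<project>/<folder>/<file>" (placeholder)

    def process(self, element, window=beam.DoFn.WindowParam):
        from apache_beam.io.filesystems import FileSystems  # import on the worker

        # Embed the window start timestamp in the object name so every
        # window writes its own file instead of overwriting one path.
        suffix = window.start.to_utc_datetime().strftime("%Y%m%d-%H%M%S")
        writer = FileSystems.create("{}-{}.csv".format(self.outdir, suffix), 'text/plain')
        writer.write(element.encode('utf-8'))
        writer.close()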
