When I try to load the model (the input model, not the meta-model), it raises a MemoryError about 30 seconds after executing.
Expected: a tree as a list: [{'type':'func', 'callee':'print', 'args':[['Hello']]}]
Actual: MemoryError
Output
Traceback (most recent call last):
File "C:/Users/kenxs/PycharmProjects/program/program/parser.py", line 103, in <module>
main()
File "C:/Users/kenxs/PycharmProjects/program/program/parser.py", line 97, in main
program.do_it(True, True, True)
File "C:/Users/kenxs/PycharmProjects/program/program/parser.py", line 80, in do_it
if cont and intp: cont, err = self.interpret()
File "C:/Users/kenxs/PycharmProjects/program/program/parser.py", line 67, in interpret
self.model = self.mm.model_from_file(os.path.abspath('program.program'))
File "C:\Program Files (x86)\Python38-32\lib\site-packages\textx\metamodel.py", line 574, in model_from_file
return self.internal_model_from_file(file_name, encoding, debug)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\textx\metamodel.py", line 613, in internal_model_from_file
model = self._parser_blueprint.clone().get_model_from_str(
File "C:\Program Files (x86)\Python38-32\lib\site-packages\textx\model.py", line 262, in get_model_from_str
self.parse(model_str, file_name=file_name)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 1493, in parse
self.parse_tree = self._parse()
File "C:\Program Files (x86)\Python38-32\lib\site-packages\textx\model.py", line 221, in _parse
return self.parser_model.parse(self)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 286, in parse
result = self._parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 365, in _parse
result = e.parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 286, in parse
result = self._parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 365, in _parse
result = e.parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 286, in parse
result = self._parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 481, in _parse
result = p(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 286, in parse
result = self._parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 404, in _parse
result = e.parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 286, in parse
result = self._parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 365, in _parse
result = e.parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 286, in parse
result = self._parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 365, in _parse
result = e.parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 286, in parse
result = self._parse(parser)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\arpeggio\__init__.py", line 484, in _parse
append(result)
MemoryError
Grammar
Program:
commands*=Command
;
Command:
Statement | Function | Definition
;
Statement:
callee=ID '(' checker=Checker ')' '=' effect=Collection Ending
;
Checker:
a=Object sign=CheckerSign b=Object
;
CheckerSign:
'==' | '!='
;
Collection:
'[' objs*=PseudoObject ']'
;
PseudoObject:
Object Ending
;
Function:
callee=ID '(' args=Arguments ')' Ending
;
Arguments:
arg*=Argument
;
Argument:
NamedArgument | UnnamedArgument
;
NamedArgument:
a=Object '=' b=Object
;
UnnamedArgument:
a=Object
;
Definition:
a=Object '=' b=Object
;
Object:
a*=ObjectChild
;
ObjectChild:
ObjectChildChild ( '.' | '' )
;
ObjectChildChild:
String | ID | INT | STRICTFLOAT | BOOL | Collection | Function
;
String:
'"' ID '"'
;
Comment:
/#.*/ Ending
;
Ending:
    ''*Newline
;
Newline:
( '\n' | ';' )
;
Program
import os
from textx import *
from textx.export import *
class Parser(object):
def __init__(self, meta_model_path='grammar.tx', model_str='print("Hello")'):
self.tree = []
self.meta_model_path = os.path.abspath(meta_model_path)
self.model_str = model_str
self.mm = None
self.model = None
def __str__(self):
return str(self.tree)
def _interpret_function(self, c):
result = {}
result['type'] = 'func'
result['callee'] = c.callee
result['args'] = []
for arg in c.args.arg:
if arg.__class__.__name__ == 'UnnamedArgument':
result['args'].append([arg.a.a])
elif arg.__class__.__name__ == 'NamedArgument':
result['args'].append([arg.a.a, arg.b.a])
return result
def _interpret_definition(self, c):
result = {}
result['type'] = 'defi'
result['a'] = c.a.a
result['b'] = c.b.a
return result
def _interpret_statement(self, c):
result = {}
result['type'] = 'stat'
result['callee'] = c.callee
result['checker_a'] = c.checker.a
result['checker_b'] = c.checker.b
result['checker_sign'] = c.checker.sign
result['effect'] = c.effect.objs
return result
def _interpret(self, model):
for c in model.commands:
if c.__class__.__name__ == 'Statement':
self.tree.append(self._interpret_statement(c))
elif c.__class__.__name__ == 'Function':
self.tree.append(self._interpret_function(c))
elif c.__class__.__name__ == 'Definition':
self.tree.append(self._interpret_definition(c))
    def export_meta_model(self):
metamodel_export(self.mm, os.path.abspath('grammar.dot'))
return [True, None]
    def export_model(self):
model_export(self.model, os.path.abspath('program.dot'))
return [True, None]
def interpret(self):
print(-1)
self.mm = metamodel_from_file(self.meta_model_path, debug=False)
print(0)
try:
self.model = self.mm.model_from_str(self.model_str)
# self.model = self.mm.model_from_file(os.path.abspath('program.prg'))
except TextXSyntaxError as err:
print('Syntax Error # {}:{}'.format(err.line, err.col))
print('{}'.format(err.message))
return [False, err]
print(1)
        self._interpret(self.model)
print(2)
return [True, None]
def do_it(self, exp_mm=False, exp_m=False, intp=True): # My naming skills :)
cont = True
err = None
if cont and intp: cont, err = self.interpret()
if cont and exp_mm: cont, err = self.export_meta_model()
if cont and exp_m: cont, err = self.export_model()
def main(debug=False):
print('Program')
program = Parser()
print('Inp Done')
program.do_it(True, True, True)
print('Done')
print(program)
if __name__ == "__main__":
main()
Rule Ending has a zero-or-more repetition over an empty string match, ''*, which is essentially an infinite loop: each iteration matches the empty string without consuming any input, building a parse tree node with an ever-growing number of empty match terminals. Eventually, the parse tree eats up all the memory and you get a MemoryError.
In general, repetitions ('*', '+') over a parsing expression that can match the empty string can lead to an infinite loop.
I suggest that you register an issue in the issue tracker for this, as it should be fairly easy to at least detect it at runtime without too much overhead.
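A minimal sketch of a fix, assuming the intent of Ending is to match zero or more line endings, is to repeat the non-empty Newline rule instead of the empty string:

Ending:
    Newline*
;

Since Newline always consumes at least one character, this repetition is guaranteed to terminate.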
Related
import dask.bag as db
class Converter():
def __init__(self,input,output):
"""Converter constructor"""
self.input = input
self.output = output
    @staticmethod
def large_file_reader(file_path: str):
"""
File reader
"""
temp_data = db.read_avro(file_path)
data = temp_data.to_dataframe()
        # just to check it can be read properly
print(data.head(6))
return data
    @staticmethod
def large_file_writer(data, file_path: str) -> bool:
"""
File writer
"""
data.compute().to_csv(file_path, index=False)
def large_file_processor(self):
"Read then write"
        input_file_path = self.input
        output_file_path = self.output
data = Converter.large_file_reader(input_file_path)
Converter.large_file_writer(data=data, file_path=output_file_path)
if __name__ == "__main__":
c = Converter("/Users/csv_to_avro_new.avro", "/Users/test_avro_new.csv")
c.large_file_processor()
Traceback (most recent call last):
File "/Users/PycharmProjects/ms--py/new.py", line 41, in <module>
c.large_file_processor()
File "/Users/PycharmProjects/ms--py/new.py", line 36, in large_file_processor
Converter.large_file_writer(data=data, file_path=output_file_path)
File "/Users/PycharmProjects/ms--py/new.py", line 28, in large_file_writer
data.compute().to_csv(file_path, index=False)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/base.py", line 315, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/base.py", line 600, in compute
results = schedule(dsk, keys, **kwargs)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/threaded.py", line 89, in get
results = get_async(
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/local.py", line 511, in get_async
raise_exception(exc, tb)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/local.py", line 319, in reraise
raise exc
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/local.py", line 224, in execute_task
result = _execute_task(task, data)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/core.py", line 119, in <genexpr>
return func(*(_execute_task(a, cache) for a in args))
File "/Users/adavsandeep/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/bag/avro.py", line 150, in read_chunk
chunk = read_block(f, off, l, head["sync"])
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/fsspec/utils.py", line 244, in read_block
found_start_delim = seek_delimiter(f, delimiter, 2**16)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/fsspec/utils.py", line 187, in seek_delimiter
current = file.read(blocksize)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/fsspec/implementations/local.py", line 337, in read
return self.f.read(*args, **kwargs)
ValueError: read of closed file
Process finished with exit code 1
Added the error traceback, check it once. Thanks.
Error
ValueError: read of closed file
I tried converting to some other formats (e.g. csv to json) in the same way and it was working, but I am not able to convert avro to csv, avro to json, or avro to parquet.
My file size is more than 2 GB. That's the reason I am using dask.
Thanks in advance.
I'm having an issue with TaggedOutputs in Apache Beam (DataflowRunner) using Python 3.9. I've included the necessary pieces of code below for understanding.
Basically, the tagged output from parent_check_pipeline for Tag.REQS_SATISFIED is not working. When the code in CheckParentRequirements yields that tagged output, the pipeline essentially ends. I get the correct log that the "Element ... has no parents", but the pipeline stops there and doesn't proceed to "Write to Pubsub Topics". I think my meaning can be seen in the Dataflow graph I included below as well.
The pipeline definitions for each step are separated into functions for ease of testing. We've used this approach in other Beam pipelines and it works, so I'm not sure what's missing here.
Thanks in advance!
Other approaches
I've tried declaring the inputs to "Write to Pubsub" as a tuple:
p_publish_messages = (
(p_check_parents_needed[Tag.REQS_SATISFIED], p_check_parents_exist[Tag.REQS_SATISFIED])
| "Write to Pubsub Topics" >> beam.ParDo(WriteToPubsubMultiple(topic_1, topic_2))
)
which gives the following error:
File ".../lib/python3.9/site-packages/apache_beam/transforms/core.py", line 1578, in expand
is_bounded = pcoll.is_bounded
AttributeError: 'tuple' object has no attribute 'is_bounded'
When using the code defined in publish_messages_pipeline with:
p_publish_messages = publish_messages_pipeline([p_check_parents_needed, p_check_parents_exist], pipeline_params)
I receive:
Traceback (most recent call last):
File "/Users/jimmy.hartman/projects/apiary/ces-ingest-eventing/src/dataflow/parent_check_pipeline.py", line 362, in <module>
run(
File "/Users/jimmy.hartman/projects/apiary/ces-ingest-eventing/src/dataflow/parent_check_pipeline.py", line 317, in run
p_publish_messages = publish_messages_pipeline([p_check_parents_needed, p_check_parents_exist], pipeline_params)
File "/Users/jimmy.hartman/projects/apiary/ces-ingest-eventing/src/dataflow/parent_check_pipeline.py", line 206, in publish_messages_pipeline
tagged_sources
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/transforms/ptransform.py", line 1095, in __ror__
return self.transform.__ror__(pvalueish, self.label)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/transforms/ptransform.py", line 622, in __ror__
p.run().wait_until_finish()
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/pipeline.py", line 574, in run
return self.runner.run_pipeline(self, self._options)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/direct/direct_runner.py", line 131, in run_pipeline
return runner.run_pipeline(pipeline, options)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 199, in run_pipeline
self._latest_run_result = self.run_via_runner_api(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 212, in run_via_runner_api
return self.run_stages(stage_context, stages)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 442, in run_stages
bundle_results = self._execute_bundle(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 770, in _execute_bundle
self._run_bundle(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 999, in _run_bundle
result, splits = bundle_manager.process_bundle(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py", line 1309, in process_bundle
result_future = self._worker_handler.control_conn.push(process_bundle_req)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/portability/fn_api_runner/worker_handlers.py", line 380, in push
response = self.worker.do_instruction(request)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/sdk_worker.py", line 597, in do_instruction
return getattr(self, request_type)(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/sdk_worker.py", line 635, in process_bundle
bundle_processor.process_bundle(instruction_id))
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/bundle_processor.py", line 1003, in process_bundle
input_op_by_transform_id[element.transform_id].process_encoded(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/bundle_processor.py", line 227, in process_encoded
self.output(decoded_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 528, in output
_cast_to_receiver(self.receivers[output_index]).receive(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 240, in receive
self.consumer.process(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 908, in process
delayed_applications = self.dofn_runner.process(o)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1419, in process
self._reraise_augmented(exn)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1491, in _reraise_augmented
raise exn
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1417, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 623, in invoke_process
self.output_handler.handle_process_outputs(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1581, in handle_process_outputs
self._write_value_to_tag(tag, windowed_value, watermark_estimator)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1694, in _write_value_to_tag
self.main_receivers.receive(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 240, in receive
self.consumer.process(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 908, in process
delayed_applications = self.dofn_runner.process(o)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1419, in process
self._reraise_augmented(exn)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1491, in _reraise_augmented
raise exn
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1417, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 623, in invoke_process
self.output_handler.handle_process_outputs(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1581, in handle_process_outputs
self._write_value_to_tag(tag, windowed_value, watermark_estimator)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1694, in _write_value_to_tag
self.main_receivers.receive(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 240, in receive
self.consumer.process(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/worker/operations.py", line 908, in process
delayed_applications = self.dofn_runner.process(o)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1419, in process
self._reraise_augmented(exn)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1507, in _reraise_augmented
raise new_exn.with_traceback(tb)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1417, in process
return self.do_fn_invoker.invoke_process(windowed_value)
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 623, in invoke_process
self.output_handler.handle_process_outputs(
File "/Users/jimmy.hartman/Library/Caches/pypoetry/virtualenvs/up-ces-ingest-eventing-qyT-FGDE-py3.9/lib/python3.9/site-packages/apache_beam/runners/common.py", line 1571, in handle_process_outputs
for result in results:
File "/Users/jimmy.hartman/projects/apiary/ces-ingest-eventing/src/dataflow/parent_check_pipeline.py", line 159, in process
enc_element = json.dumps(element).encode("utf-8")
File "/Users/jimmy.hartman/.pyenv/versions/3.9.13/lib/python3.9/json/__init__.py", line 231, in dumps
return _default_encoder.encode(obj)
File "/Users/jimmy.hartman/.pyenv/versions/3.9.13/lib/python3.9/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/Users/jimmy.hartman/.pyenv/versions/3.9.13/lib/python3.9/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
File "/Users/jimmy.hartman/.pyenv/versions/3.9.13/lib/python3.9/json/encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type _InvalidUnpickledPCollection is not JSON serializable [while running 'Write to Pubsub Topics']
Code
class CheckParentRequirements(DoFn):
def process(self, element, *args, **kwargs):
parents = get_parents(element)
if parents:
logging.getLogger(__name__).warning(f"Element {element} has parents: '{parents}'")
yield TaggedOutput(value=element, tag=Tag.PARENTS_NEEDED)
else:
logging.getLogger(__name__).warning(f"Element {element} has no parents")
yield TaggedOutput(value=element, tag=Tag.REQS_SATISFIED)
class LookupParents(DoFn):
def process(self, element):
missing_parents = self.get_missing_entities(parent_id_map, element)
if missing_parents:
self.logger.info(f"'{element}' missing parents {missing_parents}.")
element.update({Key.MISSING_PARENTS: missing_parents})
yield TaggedOutput(value=element, tag=Tag.MISSING_PARENTS)
else:
self.logger.info(f"'{element}' parents found.")
yield TaggedOutput(value=element, tag=Tag.REQS_SATISFIED)
def get_missing_parents(element):
...
class WriteToPubsubMultiple(DoFn):
def __init__(self, topic_1, topic_2):
self.topic_1 = topic_1
self.topic_2 = topic_2
self.publisher = None
def setup(self):
self.publisher = pubsub_v1.PublisherClient()
def process(self, element, *args, **kwargs):
logger = logging.getLogger(__name__)
enc_element = json.dumps(element).encode("utf-8")
self.publisher.publish(self.topic_1, enc_element)
self.publisher.publish(self.topic_2, enc_element)
        logger.info("Sent messages.")
yield None
def parent_check_pipeline(source) -> DoOutputsTuple:
p_parent_check = (
source
| "Check Parent Requirement"
>> beam.ParDo(CheckParentRequirements()).with_outputs(Tag.PARENTS_NEEDED, Tag.REQS_SATISFIED)
)
return p_parent_check
def lookup_parents_pipeline(source: DoOutputsTuple, params: PipelineParams) -> DoOutputsTuple:
p_parents_exist = source[Tag.PARENTS_NEEDED] | "Lookup Parents" >> beam.ParDo(
LookupParents(params.database_instance_id, params.database_id)
).with_outputs(Tag.MISSING_PARENTS, Tag.REQS_SATISFIED)
return p_parents_exist
def waiting_room_insert_pipeline(source: DoOutputsTuple, params: PipelineParams):
p_waiting_room_rows = (
source[Tag.MISSING_PARENTS]
| "Create Bigtable Rows" >> beam.ParDo(CreateWaitingRoomRows())
| "Bigtable Window"
>> beam.WindowInto(
window.GlobalWindows(),
trigger=Repeatedly(AfterAny(AfterCount(100), AfterProcessingTime(10))),
accumulation_mode=AccumulationMode.DISCARDING,
)
| "Write to Bigtable"
>> WriteToBigTable(params.project_id, params.instance, params.table)
)
return p_waiting_room_rows
# Not using this right now as I was troubleshooting. This is now in the `run()` method.
def publish_messages_pipeline(sources: List[DoOutputsTuple], params: PipelineParams):
tagged_sources = (source[Tag.REQS_SATISFIED] for source in sources)
p_publish_messages = (
tagged_sources
| "Write to Pubsub Topics"
>> beam.ParDo(WriteToPubsubMultiple(params.topic_1, params.topic_2))
)
return p_publish_messages
def run(
pipeline_options,
pipeline_params
):
with Pipeline(options=pipeline_options) as pipeline:
p_source = (
pipeline
| "Read from Pub/Sub" >> io.ReadFromPubSub(subscription=input_subscription)
| "Parse JSON" >> beam.Map(json.loads)
)
p_check_parents_needed = parent_check_pipeline(p_source)
p_check_parents_exist = lookup_parents_pipeline(p_check_parents_needed, pipeline_params)
p_waiting_room_insert = waiting_room_insert_pipeline(p_check_parents_exist, pipeline_params)
p_publish_messages = (
p_check_parents_needed[Tag.REQS_SATISFIED], p_check_parents_exist[Tag.REQS_SATISFIED]
| "Write to Pubsub Topics" >> beam.ParDo(WriteToPubsubMultiple(topic_1, topic_2))
)
Dataflow graph:
At first glance, I see nothing wrong with your tagged outputs (assuming Tag.WHATEVER returns a string). However, I am a bit confused about the way you factor out pipeline parts. Usually you would use PTransforms instead of plain Python functions; that might be the source of your strange behavior.
I would recommend rewriting all your pipeline functions as PTransforms, e.g.
class ParentCheckPipeline(beam.PTransform):
def expand(self, source):
p_parent_check = (
source
| "Check Parent Requirement" >> beam.ParDo(CheckParentRequirements())
.with_outputs(Tag.PARENTS_NEEDED, Tag.REQS_SATISFIED)
)
return p_parent_check
Note the mandatory expand method, containing your pipeline part.
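Applying it then reads like any other transform (a sketch using the names from the question):

p_check_parents_needed = (
    p_source
    | "Parent Check" >> ParentCheckPipeline()
)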
Turns out I needed to take advantage of the apache_beam.Flatten() transform:
Flatten is a Beam transform for PCollection objects that store the same data type. Flatten merges multiple PCollection objects into a single logical PCollection.
The new publish_messages_pipeline definition would look like this:
def publish_messages_pipeline(sources: List[DoOutputsTuple], params: PipelineParams):
tagged_sources = (source[Tag.REQS_SATISFIED] for source in sources)
p_publish_messages = (
tagged_sources
| "Flatten inputs" >> beam.Flatten()
| "Write to Pubsub Topics"
>> beam.ParDo(WriteToPubsubMultipleFn(params.topic_1, params.topic_2))
)
return p_publish_messages
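publish_messages_pipeline is then called from run() as before:

p_publish_messages = publish_messages_pipeline(
    [p_check_parents_needed, p_check_parents_exist], pipeline_params
)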
Taking the suggestion from @CaptainNabla, I put the composite transform code for inserting into Bigtable into a class extending PTransform. This not only organizes the code, but allows the graph in Dataflow to be expanded/collapsed for inspection.
class InsertIntoWaitingRoom(PTransform):
def __init__(self, params: PipelineParams):
super(InsertIntoWaitingRoom, self).__init__()
self.params = params
def expand(self, source: InputT) -> OutputT:
return (
source
| "Create Bigtable Rows" >> beam.ParDo(CreateWaitingRoomRowsFn())
| "Bigtable Window"
>> beam.WindowInto(
window.GlobalWindows(),
trigger=Repeatedly(AfterAny(AfterCount(100), AfterProcessingTime(10))),
accumulation_mode=AccumulationMode.DISCARDING,
)
| "Write to Bigtable"
>> WriteToBigTable(
self.params.project, self.params.instance, self.params.table
)
)
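Wired into run(), the step then reads (a sketch with the names used above):

p_waiting_room_insert = (
    p_check_parents_exist[Tag.MISSING_PARENTS]
    | "Insert Into Waiting Room" >> InsertIntoWaitingRoom(pipeline_params)
)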
Dataflow graph screenshots: collapsed and expanded views of the composite transform.
TL;DR: The issue will be fixed in version 3.0 of TextX. The workaround is to use regex for matching escaped (\) characters, such as \n.
FULL QUESTION: Using TextX, I am parsing a homegrown mark-up language where paragraph and line breaks are significant. I think I am missing a fundamental understanding when trying to match new lines: why are "\n" and "\n\n" not working, while their regex counterparts /\n/ and /\n\n/ do?
NOTE: whitespace is redefined at parser level to exclude \n using ws=" \t".
import textx as tx
grammar = r"""
Root:
content*=Content
;
Content:
Text | ParagraphBreak | LineBreak
;
ParagraphBreak:
paragraphbreak="\n\n"
// paragraphbreak=/\n\n/
;
LineBreak:
linebreak="\n" // Will cause parsing error
// linebreak=/\n/ // Will parse fine
;
Text[noskipws]: // All text valid
text=/[^\n]*/
;
"""
parser = tx.metamodel_from_str(grammar, ws=" \t")
source = "Line.\nBreak.\n\n"
parsed_source = parser.model_from_str(source)
print(parsed_source.content)
When running the above code on my system, using
Python 3.10.1
Poetry version 1.1.12, from poetry.lock:
[[package]] name = "arpeggio", version = "1.10.2", ..., python-versions = "*"
[[package]] name = "textx", version = "2.3.0", ..., python-versions = "*", [package.dependencies] Arpeggio = ">=1.9.0"
I get the following result:
With the root of the paths being /Users/[redacted]/Library/Caches/pypoetry/virtualenvs:
File ".../[redacted]-py3.10/lib/python3.10/site-packages/textx/model.py", line 291, in _parse
return self.parser_model.parse(self)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 291, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 370, in _parse
result = e.parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 789, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 945, in _parse
parser._nm_raise(self, c_pos, parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 1718, in _nm_raise
raise self.nm
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 485, in _parse
result = p(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 291, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 423, in _parse
parser._nm_raise(self, c_pos, parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 1718, in _nm_raise
raise self.nm
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 409, in _parse
result = e.parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 291, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 370, in _parse
result = e.parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 291, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 370, in _parse
result = e.parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 789, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 898, in _parse
parser._nm_raise(self, c_pos, parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 1718, in _nm_raise
raise self.nm
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 409, in _parse
result = e.parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 291, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 370, in _parse
result = e.parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 291, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 370, in _parse
result = e.parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 789, in parse
result = self._parse(parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 898, in _parse
parser._nm_raise(self, c_pos, parser)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 1718, in _nm_raise
raise self.nm
arpeggio.NoMatch: Expected '\n\n' or '\n' or EOF at position (1, 6) => 'Line.* Break. '.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/[redacted]/scratchpad/TextX/linebreaks.py", line 31, in <module>
parsed_source = parser.model_from_str(source)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/textx/metamodel.py", line 615, in model_from_str
model = self._parser_blueprint.clone().get_model_from_str(
File ".../[redacted]-py3.10/lib/python3.10/site-packages/textx/model.py", line 332, in get_model_from_str
self.parse(model_str, file_name=file_name)
File ".../[redacted]-py3.10/lib/python3.10/site-packages/arpeggio/__init__.py", line 1516, in parse
self.parse_tree = self._parse()
File ".../[redacted]-py3.10/lib/python3.10/site-packages/textx/model.py", line 294, in _parse
raise TextXSyntaxError(message=text(e),
textx.exceptions.TextXSyntaxError: None:1:6: error: Expected '\n\n' or '\n' or EOF at position (1, 6) => 'Line.* Break. '.
I was expecting the same result as the regex version, which is:
[<textx:Text instance at 0x10129bc40>, <textx:LineBreak instance at 0x101298040>, <textx:Text instance at 0x101298130>, <textx:ParagraphBreak instance at 0x10129aec0>]
This is the problem addressed in the current development version; please see the corresponding textX issue.
The fix will be a part of the upcoming textX 3.0 release.
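For completeness, the workaround from the TL;DR is exactly the commented-out regex variant in the question's grammar. Uncommented, it parses as expected:

import textx as tx

# Same grammar as above, with the string matches replaced by regex matches,
# which textX 2.x handles correctly.
grammar = r"""
Root:
    content*=Content
;
Content:
    Text | ParagraphBreak | LineBreak
;
ParagraphBreak:
    paragraphbreak=/\n\n/
;
LineBreak:
    linebreak=/\n/
;
Text[noskipws]:
    text=/[^\n]*/
;
"""

parser = tx.metamodel_from_str(grammar, ws=" \t")
print(parser.model_from_str("Line.\nBreak.\n\n").content)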
I am trying to update an existing document and I keep getting a TypeError (quote_from_bytes() expected bytes).
my code:
import couchdb

couch = couchdb.Server("<http>")
couch.resource.credentials = ("USERNAME","PASSWORD")
db = couch['mydb']
id = "183848484"
doc = db[id]
"""doing some operations on data here"""
db[id] = doc
My error log:
    self.db[self.docID] = doc
  File "/home/sunilgopikrishna/.local/lib/python3.5/site-packages/couchdb/client.py", line 427, in __setitem__
    resource = _doc_resource(self.resource, id)
  File "/home/sunilgopikrishna/.local/lib/python3.5/site-packages/couchdb/client.py", line 1057, in _doc_resource
    return base(doc_id)
  File "/home/sunilgopikrishna/.local/lib/python3.5/site-packages/couchdb/http.py", line 537, in __call__
    obj = type(self)(urljoin(self.url, *path), self.session)
  File "/home/sunilgopikrishna/.local/lib/python3.5/site-packages/couchdb/http.py", line 678, in urljoin
    path = '/'.join([''] + [quote(s) for s in path])
  File "/home/sunilgopikrishna/.local/lib/python3.5/site-packages/couchdb/http.py", line 678, in <listcomp>
    path = '/'.join([''] + [quote(s) for s in path])
  File "/home/sunilgopikrishna/.local/lib/python3.5/site-packages/couchdb/http.py", line 630, in quote
    return util.urlquote(string, safe)
  File "/usr/lib/python3.5/urllib/parse.py", line 712, in quote
    return quote_from_bytes(string, safe)
  File "/usr/lib/python3.5/urllib/parse.py", line 737, in quote_from_bytes
    raise TypeError("quote_from_bytes() expected bytes")
TypeError: quote_from_bytes() expected bytes
Thanks.
There was an error in the code (I had missed a ').
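That matches the traceback: couchdb URL-quotes the document id, and urllib's quote() falls through to quote_from_bytes(), which raises this TypeError whenever the value is neither str nor bytes. A hypothetical reproduction of the missing quote:

doc_id = 183848484      # int: raises 'quote_from_bytes() expected bytes'
doc_id = "183848484"    # str: works; CouchDB document ids must be strings
db[doc_id] = doc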
I have added a small debugging aid to my server. It logs a stack trace obtained from traceback.format_stack().
It contains a few incomplete lines like this:
File "/home/...../base/loop.py", line 361, in run
self.outputs.fd_list, (), sleep)
which is not that helpful.
The source lines 360 and 361:
rlist, wlist, unused = select.select(self.inputs.fd_list,
self.outputs.fd_list, (), sleep)
If only one line can be part of the stack trace, I would say that line 360, with the function name (here select.select), is the right one, because the stack is created by calling functions.
Anyway, I would prefer the whole (logical) line to be printed, or at least some context (e.g. 2 lines before). Is that possible? I mean with just an adequate effort, of course.
I tried to add a line continuation character \, but without success.
EPILOGUE:
Based on Jean-François Fabre's answer and his code I'm going to use this function:
import traceback

def print_trace():
for fname, lnum, func, line in traceback.extract_stack()[:-1]:
print('File "{}", line {}, in {}'.format(fname, lnum, func))
try:
with open(fname) as f:
rl = f.readlines()
except OSError:
if line is not None:
print(" " + line + " <===")
continue
first = max(0, lnum-3)
# read 2 lines before and 2 lines after
for i, line in enumerate(rl[first:lnum+2]):
line = line.rstrip()
if i + first + 1 == lnum:
print(" " + line + " <===")
elif line:
print(" " + line)
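Called from the code of interest, it prints each frame with two lines of context and marks the current line:

def foo():
    print_trace()

foo()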
"just with adequate effort" this can be done. But it's hack-like
check this example:
import traceback,re,os,sys
r = re.compile(r'File\s"(.*)",\sline\s(\d+)')
def print_trace():
# discard the 2 deepest entries since they're a call to print_trace()
lines = [str.split(x,"\n")[0] for x in traceback.format_stack()][:-2]
for l in lines:
m = r.search(l)
if m != None:
sys.stdout.write(l+"\n")
file = m.group(1)
line = int(m.group(2))-1
if os.path.exists(file):
with open(file,"r") as f:
rl = f.readlines()
tblines = rl[max(line-2,0):min(line+3,len(rl))]
# read 2 lines before and 2 lines after
for i,tl in enumerate(tblines):
tl = tl.rstrip()
if i==2:
sys.stdout.write(" "+tl+" <====\n")
elif tl:
sys.stdout.write(" "+tl+"\n")
def foo():
print_trace()
foo()
output:
File "C:\Users\dartypc\AppData\Roaming\PyScripter\remserver.py", line 63, in <module>
if __name__ == "__main__":
main() <====
File "C:\Users\dartypc\AppData\Roaming\PyScripter\remserver.py", line 60, in main
t = SimpleServer(ModSlaveService, port = port, auto_register = False)
t.start() <====
if __name__ == "__main__":
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\utils\server.py", line 227, in start
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\utils\server.py", line 139, in accept
File "C:\Users\dartypc\AppData\Roaming\PyScripter\remserver.py", line 14, in _accept_method
class SimpleServer(Server):
def _accept_method(self, sock):
self._serve_client(sock, None) <====
class ModSlaveService(SlaveService):
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\utils\server.py", line 191, in _serve_client
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\core\protocol.py", line 391, in serve_all
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\core\protocol.py", line 382, in serve
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\core\protocol.py", line 350, in _dispatch
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\core\protocol.py", line 298, in _dispatch_request
File "C:\Program Files\PyScripter\Lib\rpyc.zip\rpyc\core\protocol.py", line 528, in _handle_call
File "<string>", line 420, in run_nodebug
File "C:\DATA\jff\data\python\stackoverflow\traceback_test.py", line 31, in <module>
print_trace()
foo() <====
EDIT: VPfB suggested the use of extract_stack, which is a little less "hacky": there is no need to parse a string, you just get the quadruple with the traceback info (the text message needs to be rebuilt, but that's better).
import traceback,os,sys
def print_trace():
# discard the 2 deepest entries since they're a call to print_trace()
for file,line,w1,w2 in traceback.extract_stack()[:-2]:
sys.stdout.write(' File "{}", line {}, in {}\n'.format(file,line,w1))
if os.path.exists(file):
line -= 1
with open(file,"r") as f:
rl = f.readlines()
tblines = rl[max(line-2,0):min(line+3,len(rl))]
# read 2 lines before and 2 lines after
for i,tl in enumerate(tblines):
tl = tl.rstrip()
if i==2:
sys.stdout.write(" "+tl+" <====\n")
elif tl:
sys.stdout.write(" "+tl+"\n")
def foo():
print_trace()
foo()
The traceback.format_exception_only function formats only one line, except in the case of SyntaxError, so…