Python Construct - consume data for optional field

With the Python construct library, the data I'm parsing has a field which only has meaning if a flag is set.
However, the data field is always present.
Therefore, I would like to consume the data in any case, but only set the field value based on the flag's value.
For example, if the structure is (incorrectly) defined as:
struct = Struct("struct",
    Flag("flag"),
    UBInt8("optional_data"),
    UBInt8("mandatory")
)
For the data:
>>> struct.parse("010203".decode("hex"))
The result should be:
Container({'flag': True, 'mandatory': 3, 'optional_data': 2})
And for data:
>>> struct.parse("000203".decode("hex"))
The desired result is:
Container({'flag': False, 'mandatory': 3, 'optional_data': None})
I have tried the following:
struct = Struct("struct",
    Flag("flag"),
    IfThenElse("optional_data", lambda ctx: ctx.flag,
        UBInt8("dummy"),
        Padding(1)
    ),
    UBInt8("mandatory")
)
However, Padding() puts the raw data in the field, like so:
>>> struct.parse("000203".decode("hex"))
Container({'flag': False, 'mandatory': 3, 'optional_data': '\x02'})
Thank you

I am not sure if I understand your problem correctly. If your problem is only that the padding is not parsed as an int, then you don't need the IfThenElse. You can check the parsed container in your code for the flag and choose to ignore the optional_data field, as in the sketch after the struct below.
struct = Struct("struct",
    Flag("flag"),
    UBInt8("optional_data"),
    UBInt8("mandatory")
)
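A minimal sketch of that check, following the question's hex-string examples (assumed, not tested):

result = struct.parse("000203".decode("hex"))
# honour optional_data only when the flag was set
optional_data = result.optional_data if result.flag else None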
If your problem is that you want the name optional_data to be used only when the flag is set, but the name dummy to be used when the flag is not set, then you need to define two Ifs:
struct = Struct("struct",
    Flag("flag"),
    If(lambda ctx: ctx.flag,
        UBInt8("optional_data"),
    ),
    If(lambda ctx: not ctx.flag,
        UBInt8("dummy"),
    ),
    UBInt8("mandatory")
)
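With legacy construct, If returns None when its predicate is false, so both keys appear in the container and the inactive one is None. Parsing the question's second example should then give something like this (an untested sketch; key order may differ):

>>> struct.parse("000203".decode("hex"))
Container({'flag': False, 'optional_data': None, 'dummy': 2, 'mandatory': 3})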

Perhaps you could use an Adapter similar to the LengthValueAdapter for a Sequence
class LengthValueAdapter(Adapter):
    """
    Adapter for length-value pairs. It extracts only the value from the
    pair, and calculates the length based on the value.
    See PrefixedArray and PascalString.
    Parameters:
    * subcon - the subcon returning a length-value pair
    """
    __slots__ = []
    def _encode(self, obj, context):
        return (len(obj), obj)
    def _decode(self, obj, context):
        return obj[1]
class OptionalDataAdapter(Adapter):
    __slots__ = []
    def _decode(self, obj, context):
        if context.flag:
            return obj
        else:
            return None
So:

struct = Struct("struct",
    Flag("flag"),
    OptionalDataAdapter(UBInt8("optional_data")),
    UBInt8("mandatory")
)
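With the adapter in place, both of the desired results from the question should come out directly (untested, following the question's examples):

>>> struct.parse("010203".decode("hex"))
Container({'flag': True, 'mandatory': 3, 'optional_data': 2})
>>> struct.parse("000203".decode("hex"))
Container({'flag': False, 'mandatory': 3, 'optional_data': None})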

Related

Airflow - how can I get data from a BigQuery table and use it as a list?

I'm trying to get a column, then use its values to create file names.
I've tried the following, which should create a CSV named after the first value in the specified column. However, the list is empty when I try to use it:
bq_data = []

get_data = BigQueryGetDataOperator(
    task_id='get_data_from_bq',
    dataset_id='SK22',
    table_id='current_times',
    max_results='100',
    selected_fields='current_timestamps',
)

def process_data_from_bq(**kwargs):
    ti = kwargs['ti']
    global bq_data
    bq_data = ti.xcom_pull(task_ids='get_data_from_bq')

process_data = PythonOperator(
    task_id='process_data_from_bq',
    python_callable=process_data_from_bq,
    provide_context=True)

run_export = BigQueryToCloudStorageOperator(
    task_id=f"save_data_on_storage{str(bq_data[0])}",
    source_project_dataset_table="a-data-set",
    destination_cloud_storage_uris=[f"gs://europe-west1-airflow-bucket/data/test{bq_data[0]}.csv"],
    export_format="CSV",
    field_delimiter=",",
    print_header=False,
    dag=dag,
)

get_data >> process_data >> run_export
I think there is no need for a PythonOperator between BigQueryGetDataOperator and BigQueryToCloudStorageOperator; you can use an XCom pull directly in BigQueryToCloudStorageOperator:
get_data = BigQueryGetDataOperator(
    task_id='get_data_from_bq',
    dataset_id='SK22',
    table_id='current_times',
    max_results='100',
    selected_fields='current_timestamps',
)

run_export = BigQueryToCloudStorageOperator(
    task_id="save_data_on_storage",
    source_project_dataset_table="a-data-set",
    destination_cloud_storage_uris=["gs://europe-west1-airflow-bucket/data/test" + "{{ ti.xcom_pull(task_ids='get_data_from_bq')[0] }}" + ".csv"],
    export_format="CSV",
    field_delimiter=",",
    print_header=False,
    dag=dag,
)

get_data >> run_export
destination_cloud_storage_uris is a templated param, and you can pass Jinja template syntax inside it.
I haven't tested the syntax, but it should work.
I also don't recommend using a global variable like bq_data to pass data between operators, because it doesn't work; you need to find a way to use XCom directly in the operator (a Jinja template, or access to the current Context of the operator).
I also noticed that you are not using the latest Airflow operators:
BigQueryToCloudStorageOperator -> BigQueryToGCSOperator
If you want to use the whole list provided by the BigQueryGetDataOperator and calculate a list of destination URIs from it, I propose another solution:
from __future__ import annotations

from typing import List, Dict, Sequence

from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator
from google.cloud.bigquery import DEFAULT_RETRY
from urllib3 import Retry


class CustomBigQueryToGCSOperator(BigQueryToGCSOperator):

    def __init__(self,
                 source_project_dataset_table: str,
                 project_id: str | None = None,
                 compression: str = "NONE",
                 export_format: str = "CSV",
                 field_delimiter: str = ",",
                 print_header: bool = True,
                 gcp_conn_id: str = "google_cloud_default",
                 delegate_to: str | None = None,
                 labels: dict | None = None,
                 location: str | None = None,
                 impersonation_chain: str | Sequence[str] | None = None,
                 result_retry: Retry = DEFAULT_RETRY,
                 result_timeout: float | None = None,
                 job_id: str | None = None,
                 force_rerun: bool = False,
                 reattach_states: set[str] | None = None,
                 deferrable: bool = False,
                 **kwargs) -> None:
        super(CustomBigQueryToGCSOperator, self).__init__(
            source_project_dataset_table=source_project_dataset_table,
            destination_cloud_storage_uris=[],
            project_id=project_id,
            compression=compression,
            export_format=export_format,
            field_delimiter=field_delimiter,
            print_header=print_header,
            gcp_conn_id=gcp_conn_id,
            delegate_to=delegate_to,
            labels=labels,
            location=location,
            impersonation_chain=impersonation_chain,
            result_retry=result_retry,
            result_timeout=result_timeout,
            job_id=job_id,
            force_rerun=force_rerun,
            reattach_states=reattach_states,
            deferrable=deferrable,
            **kwargs
        )

    def execute(self, context):
        task_instance = context['task_instance']
        data_from_bq: List[Dict] = task_instance.xcom_pull('get_data_from_bq')
        destination_cloud_storage_uris: List[str] = list(map(self.to_destination_cloud_storage_uris, data_from_bq))
        self.destination_cloud_storage_uris = destination_cloud_storage_uris
        super(CustomBigQueryToGCSOperator, self).execute(context)

    def to_destination_cloud_storage_uris(self, data_from_bq: Dict) -> str:
        return f"gs://europe-west1-airflow-bucket/data/test{data_from_bq['your_field']}.csv"
Example of instantiation of this operator (without the destination_cloud_storage_uris field, because it's calculated inside the operator):

CustomBigQueryToGCSOperator(
    task_id="save_data_on_storage",
    source_project_dataset_table="airflow-proj.source_table.attribute_table",
    export_format="CSV",
    field_delimiter=","
)
Some explanations:
I created a custom operator that extends BigQueryToGCSOperator.
In the execute method, I have access to the current context of the operator.
From the context, I can retrieve the list from BQ provided by the BigQueryGetDataOperator. I assume it's a list of Dicts, but you have to confirm this.
I calculate a list of destination GCS URIs from this list of Dicts.
I assign the calculated destination GCS URIs to the corresponding field in the operator.
The pro of this solution: you have more flexibility to apply logic based on the XCom value.
The con: it's a little verbose.
Just as an update: I realised what I was really trying to do was have dynamic stages in a DAG (hence why I was looking at using the data as a list). I solved the issue by creating a function that uses the standard BigQuery client to get a column and parse it as a list, which I then call in the relevant stages of the DAG with Airflow's built-in expand function. Originally I was looking at just getting the first row of a column, but that was only the first stage of what I was trying to do, so the other very detailed and well-written answer more than answers this question.

convert string representation of a object using python

I have the string PyObject [methodCall=1001, attributeName=javax.servlet.jsp.jstl.fmt.request.charset, attributeValue=UTF-8],
and in Python I have the class:

class PyObject:
    def __init__(self, methodCall, attributeName, attributeValue):
        self.methodCall = methodCall
        self.attributeName = attributeName
        self.attributeValue = attributeValue

I am receiving the above string from a socket, and I want to convert it to a PyObject instance so that I can access attributes like pyobj.methodCall.
I have tried json.loads(), but it gives me a dict.
Since you added the requirement for this to be dynamic in the comments (you should add it to the question as well), consider this solution.
It is not the most beautiful or elegant, but it gets the job done.
This solution abuses the ability to dynamically create classes and instances with type.
string = 'PyObject [methodCall=1001, attributeName=javax.servlet.jsp.jstl.fmt.request.charset, attributeValue=UTF-8]'

class_name, string = string.split(' ', maxsplit=1)
data = [substring.strip() for substring in string.replace('[', '') \
                                                 .replace(']', '').split(',') if substring]
print(data)
# ['methodCall=1001', 'attributeName=javax.servlet.jsp.jstl.fmt.request.charset',
#  'attributeValue=UTF-8']

data_dict = {}
for d in data:
    key, value = d.split('=')
    data_dict[key] = value
print(data_dict)
# {'attributeValue': 'UTF-8',
#  'attributeName': 'javax.servlet.jsp.jstl.fmt.request.charset', 'methodCall': '1001'}

def init(self, **kwargs):
    for k, v in kwargs.items():
        setattr(self, k, v)

obj = type(class_name, (), {'__init__': init})(**data_dict)
print(obj)
# <__main__.PyObject object at 0x00520C70>
print(obj.methodCall)
# 1001
print(obj.attributeName)
# javax.servlet.jsp.jstl.fmt.request.charset
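If the target class is known in advance, as PyObject is in the question, the parsed dict can feed it directly; a small sketch reusing data_dict from above:

pyobj = PyObject(**data_dict)  # works because the keys match __init__'s parameter names
print(pyobj.methodCall)
# 1001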

MessagePack and datetime

I need a fast way to send 300 short messages a second over ZeroMQ between Python multiprocessing processes. Each message needs to contain an ID and time.time().
msgpack seems like the best way to serialize the dict before sending it via ZeroMQ, and conveniently, msgpack has an example of exactly what I need, except that it uses datetime.datetime.now().
import datetime

import msgpack

useful_dict = {
    "id": 1,
    "created": datetime.datetime.now(),
}

def decode_datetime(obj):
    if b'__datetime__' in obj:
        obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
    return obj

def encode_datetime(obj):
    if isinstance(obj, datetime.datetime):
        return {'__datetime__': True, 'as_str': obj.strftime("%Y%m%dT%H:%M:%S.%f")}
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime)
The problem is that their example doesn't work; I get this error:

    obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
KeyError: 'as_str'

Maybe it's because I'm on Python 3.4, but I don't know what the issue with strptime is. I would appreciate your help.
Given that messagepack (import msgpack) is good at serializing integers, I created a solution which only uses integers:

import datetime
import gc

import dateutil.tz
import msgpack

_datetime_ExtType = 42

def _unpacker_hook(code, data):
    if code == _datetime_ExtType:
        values = unpack(data)
        if len(values) == 8:  # we have a timezone
            return datetime.datetime(*values[:-1], dateutil.tz.tzoffset(None, values[-1]))
        else:
            return datetime.datetime(*values)
    return msgpack.ExtType(code, data)

# This will only get called for unknown types
def _packer_unknown_handler(obj):
    if isinstance(obj, datetime.datetime):
        if obj.tzinfo:
            components = (obj.year, obj.month, obj.day, obj.hour, obj.minute,
                          obj.second, obj.microsecond, int(obj.utcoffset().total_seconds()))
        else:
            components = (obj.year, obj.month, obj.day, obj.hour, obj.minute,
                          obj.second, obj.microsecond)
        # we effectively double-pack the values to "compress" them
        data = msgpack.ExtType(_datetime_ExtType, pack(components))
        return data
    raise TypeError("Unknown type: {}".format(obj))

def pack(obj, **kwargs):
    # we don't use a global packer because it wouldn't be re-entrant safe
    return msgpack.packb(obj, use_bin_type=True, default=_packer_unknown_handler, **kwargs)

def unpack(payload):
    try:
        # we temporarily disable gc during unpack to bump up perf:
        # https://pypi.python.org/pypi/msgpack-python
        gc.disable()
        # This must match the above _packer parameters. NOTE: use_list is faster
        return msgpack.unpackb(payload, use_list=False, encoding='utf-8', ext_hook=_unpacker_hook)
    finally:
        gc.enable()
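A quick round trip with these helpers (an illustrative sketch; the message shape follows the question's id/timestamp dict):

msg = {"id": 1, "created": datetime.datetime.now()}
payload = pack(msg)          # bytes, ready to send over a zeromq socket
restored = unpack(payload)   # keys decode back to str because of encoding='utf-8'
assert restored['created'] == msg['created']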
Python 3 and Python 2 manage string encoding differently: see encoding-and-decoding-strings-in-python-3-x.
It is then necessary to:
use b'as_str' (instead of 'as_str') as the dictionary key
use encode and decode for the stored value
Modifying the code like this works with both Python 2 and Python 3:
import datetime

import msgpack

useful_dict = {
    "id": 1,
    "created": datetime.datetime.now(),
}

def decode_datetime(obj):
    if b'__datetime__' in obj:
        obj = datetime.datetime.strptime(obj[b'as_str'].decode(), "%Y%m%dT%H:%M:%S.%f")
    return obj

def encode_datetime(obj):
    if isinstance(obj, datetime.datetime):
        obj = {'__datetime__': True, 'as_str': obj.strftime("%Y%m%dT%H:%M:%S.%f").encode()}
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime)
In the documentation it says that the default encoding of unicode strings in packb (and pack) is UTF-8. You could simply search for the unicode string '__datetime__' in the decode_datetime function, instead of the byte object b'__datetime__', and add the encoding='utf-8' parameter to unpackb, like so:
def decode_datetime(obj):
    if '__datetime__' in obj:
        obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime, encoding='utf-8')
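One caveat: in msgpack-python 1.0 and later the encoding parameter was removed from unpackb; as far as I know the equivalent is raw=False, which likewise decodes msgpack strings to str:

this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime, raw=False)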

Auto format arguments to string

I am writing a function that takes some arguments and builds a SQL query string. I currently have each of the keyword arguments assigned to the string by name, and I think there has to be an easier way to automatically format the expected keyword arguments into the string. I'm wondering if it is possible to automatically format (or assign) a keyword argument to the query string when the argument name matches.
Here is my current function:
def create_query(
        who_id,
        subject,
        description,
        due_date,
        what_id=None,
        owner_id=None,
        is_closed=False,
        is_priority=False,
        is_reminder=True,
):
    query_str = """
    'WhoId':'{who_id}',
    'Subject':'{subject}',
    'Description':'{description}',
    'ActivityDate':'{due_date}',
    'IsClosed':'{is_closed}',
    'IsPriority':'{is_priority}',
    'IsReminderSet':'{is_reminder}',
    """.format(
        who_id=who_id,
        subject=subject,           # Mapping each of these to the
        description=description,   # identically named variable
        due_date=due_date,         # seems dumb
        is_closed=is_closed,
        is_priority=is_priority,
        is_reminder=is_reminder,
    )
    return query_str
And I would like something that is more like this:
def create_query(**kwargs):
    query_string = """
    'WhoId':'{who_id}',
    'Subject':'{subject}',
    'Description':'{description}',
    ...
    """.format(
        for key in **kwargs:
            if key == << one of the {keyword} variables in query_string >>:
                key = << the matching variable in query_string >>
def create_query(**kwargs):
    query_string = """
    'WhoId':'{who_id}',
    'Subject':'{subject}',
    'Description':'{description}',
    ...
    """.format(**kwargs)
    print(query_string)
It can then be used like this:
create_query(who_id=123, subject="Boss", description="he's bossy",
             will_be_ignored="really?")

Prints:

'WhoId':'123',
'Subject':'Boss',
'Description':'he's bossy',
...
Note the additional parameter will_be_ignored that I passed in. It is simply ignored; you don't have to filter it out yourself.
But it's better not to build the query string yourself; let the database connector handle that. For example, if my example said "he's bossy", that would break the query, because it uses ' as the delimiter. Your database connector should allow you to give it queries with placeholders and values, and then properly replace the placeholders with the values.
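A minimal sketch of the placeholder approach with a DB-API connector (sqlite3 here purely for illustration; the table and columns are made up):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE task (who_id INTEGER, subject TEXT, description TEXT)")
# the driver fills the ? placeholders, so a quote inside a value cannot break the query
conn.execute("INSERT INTO task VALUES (?, ?, ?)", (123, "Boss", "he's bossy"))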
Alternative:

def create_query(**kwargs):
    parameters = (('WhoId', 'who_id'),
                  ('Subject', 'subject'),
                  ('Description', 'description'))
    query_string = ','.join(r"'{}':'{{{}}}'".format(name, kwargs[id])
                            for name, id in parameters
                            if id in kwargs)
    print(query_string)

create_query(who_id=123, description="he's bossy",
             will_be_ignored="really?")

Prints:

'WhoId':'{123}','Description':'{he's bossy}'

Best way to represent this using Python data structures

<specification>
    <propertyName> string </propertyName>
    <value>
        <number>
            <value> anyNumberHere </value>
        </number>
        <text>
            <value> anyTextHere </value>
        </text>
        <URL>
            <value> anyURIHere </value>
        </URL>
    </value>
    <!-- ... 1 or more value nodes here ... -->
</specification>
<!-- ... 1 or more specification nodes here ... -->
I need to construct this request for an API, where the user will pass these values. What is the best way to represent this, so that it is easy for the user of the method to pass the respective values to the operation?
I am thinking a list of lists of dicts:
[specification1, specification2, specification3]
Where:
specification1 = [value1, value2]
Where:
value1 = {number: anyNumberHere, text: anyTextHere, URL: anyURIHere}
value2 = {number: anyNumberHere, text: anyTextHere, URL: anyURIHere}
But I am not able to accommodate <propertyName> here. Any suggestions?
Moreover, it sounds way too complicated. Can we have object encapsulation like we do in Java? I understand we can, but I am curious what the recommended way is in Python.
My logic for now (incorrect, because it doesn't accommodate propertyName):

# specification is a list of lists of dicts
for spec in specification:
    specification_elem = etree.SubElement(root, "specification")
    propertyName_elem = etree.SubElement(specification_elem, "propertyName")
    propertyName_elem.text = spec_propertyName
    for value in spec:
        value_elem = etree.SubElement(specification_elem, "value")
        for key in value:
            key_elem = etree.SubElement(value_elem, key)
            keyValue_elem = etree.SubElement(key_elem, "value")
            keyValue_elem.text = value[key]

Here, I pass spec_propertyName as a separate parameter, so the user passes both specification and spec_propertyName.
How about a list of Specification objects, each of which has a property_name and a list of value dictionaries? (The values could be a list of objects instead of dictionaries.)
For example:
class Value(object):
    def __init__(self,
                 number=None,
                 text=None,
                 url=None):
        self.number = number
        self.text = text
        self.url = url

class Specification(object):
    def __init__(self, propertyName):
        self.propertyName = propertyName
        self.values = []

spec1 = Specification("Spec1")
spec1.values = [Value(number=3,
                      url="http://www.google.com"),
                Value(text="Hello, World!",
                      url="http://www.jasonfruit.com")]

spec2 = Specification("Spec2")
spec2.values = [Value(number=27,
                      text="I can haz cheezburger?",
                      url="http://stackoverflow.com"),
                Value(text="Running out of ideas.",
                      url="http://news.google.com")]
If propertyNames are unique, you can have a dict of lists of dicts. If they are not, you can have a list of lists of lists of dicts.
If all those lists end up hard to keep track of, you can make a class to store the data:
class Specification:
    def __init__(self):
        self.propertyName = ""
        self.values = []

Then, use:

spec = Specification()
spec.propertyName = "string"
value = {"URL": "someURI", "text": "someText"}
spec.values.append(value)
Here's a representation approach using named tuples. You can upgrade to using classes (adding code to do some validation on the caller's input, and allowing the omission of field=None for optional fields) at your leisure with no other change to the user API and little change to the ElementTree-building code.
# -*- coding: cp1252 -*-
from collections import namedtuple
import xml.etree.cElementTree as etree

Specifications = namedtuple('Specifications', 'specification_list')
Specification = namedtuple('Specification', 'propertyName value_list')
Value = namedtuple('Value', 'number text url')

def make_etree(specifications, encoding):
    """
    Convert user's `specifications` to an ElementTree.
    `encoding` is encoding of *input* `str` objects.
    """
    def ensure_unicode(v):
        if isinstance(v, str): return v.decode(encoding)
        if isinstance(v, unicode): return v
        return unicode(v)  # convert numbers etc to unicode strings
    root = etree.Element('specifications')
    for spec in specifications.specification_list:
        specification_elem = etree.SubElement(root, "specification")
        propertyName_elem = etree.SubElement(specification_elem, "propertyName")
        propertyName_elem.text = ensure_unicode(spec.propertyName)
        for value in spec.value_list:
            value_elem = etree.SubElement(specification_elem, "value")
            for key in value._fields:
                kv = getattr(value, key)
                if kv is None: continue
                key_elem = etree.SubElement(value_elem, key)
                keyValue_elem = etree.SubElement(key_elem, "value")
                keyValue_elem.text = ensure_unicode(kv)
    return etree.ElementTree(root)

# === sample caller code follows ===

specs = Specifications(
    specification_list=[
        Specification(
            propertyName='a prop',
            value_list=[
                Value(
                    number=42,
                    text='universe',
                    url='http://uww.everywhere',
                ),
                Value(
                    number=0,
                    text=None,  # optional
                    url='file:///dev/null',
                ),
            ],
        ),
        Specification(
            propertyName='b prop',
            value_list=[
                Value(
                    number=1,
                    text='Üñîçøðè',   # str object, encoded in cp1252
                    url=u'Üñîçøðè',  # unicode object
                ),
            ],
        ),
    ],
)

print repr(specs); print

import sys
tree = make_etree(specs, 'cp1252')
import cStringIO
f = cStringIO.StringIO()
tree.write(f, encoding='UTF-8', xml_declaration=True)
print repr(f.getvalue())
print
print
Output (folded at column 80):
Specifications(specification_list=[Specification(propertyName='a prop', value_li
st=[Value(number=42, text='universe', url='http://uww.everywhere'), Value(number
=0, text=None, url='file:///dev/null')]), Specification(propertyName='b prop', v
alue_list=[Value(number=1, text='\xdc\xf1\xee\xe7\xf8\xf0\xe8', url=u'\xdc\xf1\x
ee\xe7\xf8\xf0\xe8')])])
"<?xml version='1.0' encoding='UTF-8'?>\n<specifications><specification><propert
yName>a prop</propertyName><value><number><value>42</value></number><text><value
>universe</value></text><url><value>http://uww.everywhere</value></url></value><
value><number><value>0</value></number><url><value>file:///dev/null</value></url
></value></specification><specification><propertyName>b prop</propertyName><valu
e><number><value>1</value></number><text><value>\xc3\x9c\xc3\xb1\xc3\xae\xc3\xa7
\xc3\xb8\xc3\xb0\xc3\xa8</value></text><url><value>\xc3\x9c\xc3\xb1\xc3\xae\xc3\
xa7\xc3\xb8\xc3\xb0\xc3\xa8</value></url></value></specification></specification
s>"
