Convert string representation of an object using Python

I have the string PyObject [methodCall=1001, attributeName=javax.servlet.jsp.jstl.fmt.request.charset, attributeValue=UTF-8]
and in Python I have the class:
class PyObject:
    def __init__(self, methodCall, attributeName, attributeValue):
        self.methodCall = methodCall
        self.attributeName = attributeName
        self.attributeValue = attributeValue
I receive the above string from a socket and want to convert it to an instance of PyObject so that I can access attributes like pyobj.methodCall.
I have tried json.loads(), but it gives me a dict.

Since you added the requirement for this to be dynamic in the comments (you should add it to the question as well), consider this solution.
It is not the most beautiful or elegant, but it gets the job done.
This solution abuses the ability to dynamically create classes and instances with type.
string = 'PyObject [methodCall=1001, attributeName=javax.servlet.jsp.jstl.fmt.request.charset, attributeValue=UTF-8]'
class_name, string = string.split(' ', maxsplit=1)

data = [substring.strip() for substring in string.replace('[', '')
                                                 .replace(']', '').split(',') if substring]
print(data)
# ['methodCall=1001', 'attributeName=javax.servlet.jsp.jstl.fmt.request.charset',
#  'attributeValue=UTF-8']

data_dict = {}
for d in data:
    key, value = d.split('=')
    data_dict[key] = value
print(data_dict)
# {'attributeValue': 'UTF-8',
#  'attributeName': 'javax.servlet.jsp.jstl.fmt.request.charset', 'methodCall': '1001'}

def init(self, **kwargs):
    for k, v in kwargs.items():
        setattr(self, k, v)

obj = type(class_name, (), {'__init__': init})(**data_dict)
print(obj)
# <__main__.PyObject object at 0x00520C70>
print(obj.methodCall)
# 1001
print(obj.attributeName)
# javax.servlet.jsp.jstl.fmt.request.charset
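If the PyObject class from the question is already defined, a simpler variant (a minimal sketch reusing data_dict from above) is to unpack the parsed dict straight into that constructor, since its keys match the constructor's parameter names:

obj = PyObject(**data_dict)   # uses the class from the question, not a dynamically created one
print(obj.methodCall)
# 1001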


MessagePack and datetime

I need a fast way to send 300 short messages per second over ZeroMQ between Python multiprocessing processes. Each message needs to contain an ID and time.time().
msgpack seems like the best way to serialize the dict before sending it via ZeroMQ, and conveniently, msgpack has an example of exactly what I need, except that it uses a datetime.datetime.now().
import datetime
import msgpack

useful_dict = {
    "id": 1,
    "created": datetime.datetime.now(),
}

def decode_datetime(obj):
    if b'__datetime__' in obj:
        obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
    return obj

def encode_datetime(obj):
    if isinstance(obj, datetime.datetime):
        return {'__datetime__': True, 'as_str': obj.strftime("%Y%m%dT%H:%M:%S.%f")}
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime)
The problem is that their example doesn't work; I get this error:
obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
KeyError: 'as_str'
Maybe it's because I'm on Python 3.4, but I don't know what the issue with strptime is. I would appreciate your help.
Given that MessagePack (import msgpack) is good at serializing integers, I created a solution which only uses integers:
import datetime
import gc

import dateutil.tz
import msgpack

_datetime_ExtType = 42

def _unpacker_hook(code, data):
    if code == _datetime_ExtType:
        values = unpack(data)
        if len(values) == 8:  # we have a timezone
            return datetime.datetime(*values[:-1], dateutil.tz.tzoffset(None, values[-1]))
        else:
            return datetime.datetime(*values)
    return msgpack.ExtType(code, data)

# This will only get called for unknown types
def _packer_unknown_handler(obj):
    if isinstance(obj, datetime.datetime):
        if obj.tzinfo:
            components = (obj.year, obj.month, obj.day, obj.hour, obj.minute,
                          obj.second, obj.microsecond, int(obj.utcoffset().total_seconds()))
        else:
            components = (obj.year, obj.month, obj.day, obj.hour, obj.minute,
                          obj.second, obj.microsecond)
        # we effectively double-pack the values to "compress" them
        data = msgpack.ExtType(_datetime_ExtType, pack(components))
        return data
    raise TypeError("Unknown type: {}".format(obj))

def pack(obj, **kwargs):
    # we don't use a global packer because it wouldn't be re-entrant safe
    return msgpack.packb(obj, use_bin_type=True, default=_packer_unknown_handler, **kwargs)

def unpack(payload):
    try:
        # we temporarily disable gc during unpack to bump up perf:
        # https://pypi.python.org/pypi/msgpack-python
        gc.disable()
        # This must match the _packer parameters above. NOTE: use_list=False is faster
        return msgpack.unpackb(payload, use_list=False, encoding='utf-8', ext_hook=_unpacker_hook)
    finally:
        gc.enable()
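A quick round trip with these helpers might look like this (a sketch, assuming an msgpack-python version that still accepts the encoding parameter used above; the dict contents are just example values):

now = datetime.datetime.now(dateutil.tz.tzutc())
payload = pack({"id": 1, "created": now})
restored = unpack(payload)
print(restored["created"] == now)
# True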
Python 3 and Python 2 handle string encoding differently: encoding-and-decoding-strings-in-python-3-x
You therefore need to:
use b'as_str' (instead of 'as_str') as the dictionary key
use encode and decode for the stored value
Modifying the code like this makes it work with both Python 2 and Python 3:
import datetime
import msgpack

useful_dict = {
    "id": 1,
    "created": datetime.datetime.now(),
}

def decode_datetime(obj):
    if b'__datetime__' in obj:
        obj = datetime.datetime.strptime(obj[b'as_str'].decode(), "%Y%m%dT%H:%M:%S.%f")
    return obj

def encode_datetime(obj):
    if isinstance(obj, datetime.datetime):
        obj = {'__datetime__': True, 'as_str': obj.strftime("%Y%m%dT%H:%M:%S.%f").encode()}
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime)
In the documentation it says that the default encoding of unicode strings in packb (and pack) is utf-8. You could simply search for the unicode string '__datetime__' in the decode_datetime function, instead of the byte object b'__datetime__', and add the encoding='utf-8' parameter to unpackb. Like so:
def decode_datetime(obj):
    if '__datetime__' in obj:
        obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime, encoding='utf-8')
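Note that newer releases of msgpack-python have deprecated and then removed the encoding parameter; on such a version the closest equivalent, to the best of my knowledge, is raw=False, which also decodes strings as UTF-8. A hedged sketch:

# Sketch for msgpack-python releases where encoding= is no longer accepted:
packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime, raw=False)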

Correcting errors in one file and writing it to a new file

Okay so I have a transaction file:
IN CU
Customer_ID=
Last_Name=Johnston
First_Name=Karen
Street_Address=291 Stone Cr
City=Toronto
//
IN VE
License_Plate#=LSR976
Make=Cadillac
Model=Seville
Year=1996
Owner_ID=779
//
IN SE
Vehicle_ID=LSR976
Service_Code=461
Date_Scheduled=00/12/19
IN means insert, and CU (meaning customer) refers to which file we are writing to; in this case it's customer.diff. The problem I'm having is that I need to go through each line and check the value of each field (Customer_ID, for example). You see how Customer_ID is left blank? I need to replace any blank numeric fields with a value of 0, so for example Customer_ID=0 in this case. Here's what I have so far, but nothing is changing:
def insertion():
    field_names = {'Customer_ID=': 'Customer_ID=0',
                   'Home_Phone=': 'Home_Phone=0', 'Business_Phone=': 'Business_Phone=0'}
    with open('xactions.two.txt', 'r') as from_file:
        search_lines = from_file.readlines()
        if search_lines[3:5] == 'CU':
            for i in search_lines:
                if field_names[i] == True:
                    with open('customer.diff', 'w') as to_file:
                        to_file.write(field_names[i])
Thanks
Why not try something a little simpler? I haven't tested this code.
def insertion():
    field_names = {'Customer_ID=': 'Customer_ID=0',
                   'Home_Phone=': 'Home_Phone=0', 'Business_Phone=': 'Business_Phone=0'}
    with open('xactions.two.txt', 'r') as from_file:
        with open('customer.diff', 'w') as to_file:
            for line in from_file:
                line = line.rstrip("\n")
                found = False
                for field in field_names.keys():
                    if field in line:
                        to_file.write(line + "0")
                        found = True
                if not found:
                    to_file.write(line)
                to_file.write("\n")
Here's a fairly comprehensive approach; it's a bit long, but not as complicated as it looks!
I assume Python 3.x, although it should work in Python 2.x with few changes. I make extensive use of generators to stream data through rather than holding it in memory.
To start with: we are going to define the expected data-type for each field. Some fields do not correspond to built-in Python data types, so I start by defining some custom data types for those fields:
import time

class Date:
    def __init__(self, s):
        """
        Parse a date provided as "yy/mm/dd"
        """
        if s.strip():
            self.date = time.strptime(s, "%y/%m/%d")
        else:
            self.date = time.gmtime(0.)

    def __str__(self):
        """
        Return a date as "yy/mm/dd"
        """
        return time.strftime("%y/%m/%d", self.date)

def Int(s):
    """
    Parse a string to integer ("" => 0)
    """
    if s.strip():
        return int(s)
    else:
        return 0

class Year:
    def __init__(self, s):
        """
        Parse a year provided as "yyyy"
        """
        if s.strip():
            self.date = time.strptime(s, "%Y")
        else:
            self.date = time.gmtime(0.)

    def __str__(self):
        """
        Return a year as "yyyy"
        """
        return time.strftime("%Y", self.date)
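As a quick sanity check (a hypothetical REPL-style demo, not part of the original answer), these helpers turn blank fields into sensible defaults:

print(Int(""))             # 0
print(Int("779"))          # 779
print(Date("00/12/19"))    # 00/12/19
print(Date(""))            # 70/01/01  (epoch placeholder for a blank date)
print(Year("1996"))        # 1996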
Now we set up a table, defining what type each field should be:
# Expected data-type of each field:
#   data_types[section][field] = type
data_types = {
    "CU": {
        "Customer_ID":    Int,
        "Last_Name":      str,
        "First_Name":     str,
        "Street_Address": str,
        "City":           str
    },
    "VE": {
        "License_Plate#": str,
        "Make":           str,
        "Model":          str,
        "Year":           Year,
        "Owner_ID":       Int
    },
    "SE": {
        "Vehicle_ID":     str,
        "Service_Code":   Int,
        "Date_Scheduled": Date
    }
}
We parse the input file; this is by far the most complicated bit! It's a finite state machine implemented as a generator function, yielding a section at a time:
# Customized error-handling
class TransactionError          (BaseException):    pass
class EntryNotInSectionError    (TransactionError): pass
class MalformedLineError        (TransactionError): pass
class SectionNotTerminatedError (TransactionError): pass
class UnknownFieldError         (TransactionError): pass
class UnknownSectionError       (TransactionError): pass

def read_transactions(fname):
    """
    Read a transaction file
    Return a series of ("section", {"key": "value"})
    """
    section, accum = None, {}
    with open(fname) as inf:
        for line_no, line in enumerate(inf, 1):
            line = line.strip()
            if not line:
                # blank line - skip it
                pass
            elif line == "//":
                # end of section - return any accumulated data
                if accum:
                    yield (section, accum)
                section, accum = None, {}
            elif line[:3] == "IN ":
                # start of section
                if accum:
                    raise SectionNotTerminatedError(
                        "Line {}: Preceding {} section was not terminated"
                        .format(line_no, section)
                    )
                else:
                    section = line[3:].strip()
                    if section not in data_types:
                        raise UnknownSectionError(
                            "Line {}: Unknown section type {}"
                            .format(line_no, section)
                        )
            else:
                # data entry: "key=value"
                if section is None:
                    raise EntryNotInSectionError(
                        "Line {}: '{}' should be in a section"
                        .format(line_no, line)
                    )
                pair = line.split("=")
                if len(pair) != 2:
                    raise MalformedLineError(
                        "Line {}: '{}' could not be parsed as a key/value pair"
                        .format(line_no, line)
                    )
                key, val = pair
                if key not in data_types[section]:
                    raise UnknownFieldError(
                        "Line {}: unrecognized field name {} in section {}"
                        .format(line_no, key, section)
                    )
                accum[key] = val.strip()
    # end of file - nothing should be left over
    if accum:
        raise SectionNotTerminatedError(
            "End of file: Preceding {} section was not terminated"
            .format(line_no, section)
        )
Now that the file is read, the rest is easier. We do type-conversion on each field, using the lookup table we defined above:
def format_field(section, key, value):
    """
    Cast a field value to the appropriate data type
    """
    return data_types[section][key](value)

def format_section(section, accum):
    """
    Cast all values in a section to the appropriate data types
    """
    return (section, {key: format_field(section, key, value) for key, value in accum.items()})
and write the results back to file:
def write_transactions(fname, transactions):
    with open(fname, "w") as outf:
        for section, accum in transactions:
            # start section
            outf.write("IN {}\n".format(section))
            # write key/value pairs in order by key
            keys = sorted(accum.keys())
            for key in keys:
                outf.write(" {}={}\n".format(key, accum[key]))
            # end section
            outf.write("//\n")
All the machinery is in place; we just have to call it:
def main():
    INPUT = "transaction.txt"
    OUTPUT = "customer.diff"
    transactions = read_transactions(INPUT)
    cleaned_transactions = (format_section(section, accum) for section, accum in transactions)
    write_transactions(OUTPUT, cleaned_transactions)

if __name__ == "__main__":
    main()
Hope that helps!

Parse HTML style text annotations to a list of dictionaries

Currently I have the following problem:
Given a string
"<a>annotated <b>piece</b></a> of <c>text</c>"
construct a list such that the result is
[{"annotated": ["a"]}, {"piece": ["a", "b"]}, {"of": []}, {"text": ["c"]}].
My previous attempt looked something like
import re
from itertools import chain
from nltk import word_tokenize  # word_tokenize is assumed to come from NLTK

open_tag = '<[a-z0-9_]+>'
close_tag = '<\/[a-z0-9_]+>'
tag_def = "(" + open_tag + "|" + close_tag + ")"

def tokenize(str):
    """
    Takes a string and converts it to a list of words or tokens
    For example "<a>foo</a>, of" -> ['<a>', 'foo', '</a>', ',' 'of']
    """
    tokens_by_tag = re.split(tag_def, str)
    def tokenize(token):
        if not re.match(tag_def, token):
            return word_tokenize(token)
        else:
            return [token]
    return list(chain.from_iterable([tokenize(token) for token in tokens_by_tag]))
def annotations(tokens):
    """
    Process tokens into a list with {word : [tokens]} items
    """
    mapping = []
    curr = []
    for token in tokens:
        if re.match(open_tag, token):
            curr.append(re.match('<([a-z0-9_]+)>', token).group(1))
        elif re.match(close_tag, token):
            tag = re.match('<\/([a-z0-9_]+)>', token).group(1)
            try:
                curr.remove(tag)
            except ValueError:
                pass
        else:
            mapping.append({token: list(curr)})
    return mapping
Unfortunately this has a flaw, since (n=54) resolves to {"n=54" : []} but (n=<n>52</n>) resolves to [{"n=": []}, {52: ["n"]}]; the lengths of the two lists therefore differ, making it impossible to merge two differently annotated versions later on.
Is there a good strategy for parsing HTML/SGML-style annotations in a way that two differently annotated (but otherwise identical) strings yield lists of equal size?
Note that I'm well aware that regexps are not suitable for this type of parsing, but that is not the problem in this case.
EDIT: Fixed a mistake in the example
Your XML data (or HTML) is not well formed.
Assuming an input file with the following well-formed XML data:
<root><a>annotated <b>piece</b></a> of <c>text</c></root>
you can use a sax parser to append and pop tags at start and end element events:
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
import sys

class Xml2PseudoJson(ContentHandler):
    def __init__(self):
        self.tags = []
        self.chars = []
        self.json = []

    def startElement(self, tag, attrs):
        d = {''.join(self.chars): self.tags[:]}
        self.json.append(d)
        self.tags.append(tag)
        self.chars = []

    def endElement(self, tag):
        d = {''.join(self.chars): self.tags[:]}
        self.chars = []
        self.tags.pop()
        self.json.append(d)

    def characters(self, content):
        self.chars.append(content)

    def endDocument(self):
        print(list(filter(lambda x: '' not in x, self.json)))

parser = make_parser()
handler = Xml2PseudoJson()
parser.setContentHandler(handler)
parser.parse(open(sys.argv[1]))
Run it like:
python3 script.py xmlfile
That yields:
[
{'annotated ': ['root', 'a']},
{'piece': ['root', 'a', 'b']},
{' of ': ['root']},
{'text': ['root', 'c']}
]
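If the input cannot be wrapped in a single root element, a similar approach is possible with the standard library's html.parser, which tolerates fragments. This is a rough sketch (not from the original answers; it splits text on whitespace rather than using a proper word tokenizer, so punctuation handling differs):

from html.parser import HTMLParser

class AnnotationParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.tags = []      # currently open annotation tags
        self.result = []    # accumulated {word: [tags]} entries

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if self.tags and self.tags[-1] == tag:
            self.tags.pop()

    def handle_data(self, data):
        for word in data.split():
            self.result.append({word: list(self.tags)})

parser = AnnotationParser()
parser.feed("<a>annotated <b>piece</b></a> of <c>text</c>")
print(parser.result)
# [{'annotated': ['a']}, {'piece': ['a', 'b']}, {'of': []}, {'text': ['c']}]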

Python Construct - consume data for optional field

With the Python construct library, the data I'm parsing has a field which only has meaning if a flag is set.
However, the data field is always present.
Therefore, I would like to consume the data in any case, but only set the field value based on the flag's value.
For example, if the structure is (incorrectly) defined as:
struct = Struct("struct",
    Flag("flag"),
    UBInt8("optional_data"),
    UBInt8("mandatory")
)
For the data:
>>> struct.parse("010203".decode("hex"))
The result should be:
Container({'flag': True, 'mandatory': 3, 'optional_data': 2})
And for data:
>>> struct.parse("000203".decode("hex"))
The desired result is:
Container({'flag': False, 'mandatory': 3, 'optional_data': None})
I have tried the following:
struct = Struct("struct",
    Flag("flag"),
    IfThenElse("optional_data", lambda ctx: ctx.flag,
        UBInt8("dummy"),
        Padding(1)
    ),
    UBInt8("mandatory")
)
However, Padding() puts the raw data in the field, like so:
>>> struct.parse("000203".decode("hex"))
Container({'flag': False, 'mandatory': 3, 'optional_data': '\x02'})
Thank you
I am not sure if I understand your problem correctly. If your problem is only that the padding is not parsed as an int, then you don't need the IfThenElse. You can check the parsed container in your code for flag and choose to ignore the optional_data field.
struct = Struct("struct",
    Flag("flag"),
    UBInt8("optional_data"),
    UBInt8("mandatory")
)
If your problem is that you want the name optional_data to be used only when the flag is set, but the name dummy to be used when the flag is not set, then you need to define two Ifs:
struct = Struct("struct",
    Flag("flag"),
    If("optional_data", lambda ctx: ctx.flag,
        UBInt8("useful_byte"),
    ),
    If("dummy", lambda ctx: not ctx.flag,  # '!' is not valid Python; use 'not'
        UBInt8("ignore_byte"),
    ),
    UBInt8("mandatory")
)
Perhaps you could use an Adapter similar to the LengthValueAdapter for a Sequence
class LengthValueAdapter(Adapter):
    """
    Adapter for length-value pairs. It extracts only the value from the
    pair, and calculates the length based on the value.
    See PrefixedArray and PascalString.
    Parameters:
    * subcon - the subcon returning a length-value pair
    """
    __slots__ = []
    def _encode(self, obj, context):
        return (len(obj), obj)
    def _decode(self, obj, context):
        return obj[1]

class OptionalDataAdapter(Adapter):
    __slots__ = []
    def _decode(self, obj, context):
        if context.flag:
            return obj
        else:
            return None
So
struct = Struct("struct",
    Flag("flag"),
    OptionalDataAdapter(UBInt8("optional_data")),
    UBInt8("mandatory")
)
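Assuming construct 2.5-style semantics (as used in the question), parsing should then give results close to what was asked for; a hypothetical check, mirroring the question's Python 2 examples:

print(struct.parse("010203".decode("hex")))
# Container({'flag': True, 'mandatory': 3, 'optional_data': 2})
print(struct.parse("000203".decode("hex")))
# Container({'flag': False, 'mandatory': 3, 'optional_data': None})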

Best way to represent this using Python data structures

<specification>
    <propertyName> string </propertyName>
    <value>
        <number>
            <value> anyNumberHere </value>
        </number>
        <text>
            <value> anyTextHere </value>
        </text>
        <URL>
            <value> anyURIHere </value>
        </URL>
    </value>
    <!-- ... 1 or more value nodes here ... -->
</specification>
<!-- ... 1 or more specification nodes here ... -->
I need to construct this request for an API, where the user will pass these values. What is the best way to represent this so that it is easy for the user of the method to pass the respective values to the operation?
I am thinking:
List of List of Dicts:
[specification1, specification2, specification3]
Where:
specification1= [value1, value2]
Where:
value1 = {number:anyNumberHere, text:anyTextHere, URL:anyURIHere}
value2 = {number:anyNumberHere, text:anyTextHere, URL:anyURIHere}
But I am not able to accommodate <propertyName> here. Any suggestions?
Moreover, it sounds way too complicated. Can we have object encapsulation like we do in Java? I understand we can, but I am curious: what is the recommended way in Python?
My logic for now (incorrect because of propertyName); suggestions welcome:
# specification is a List of List of Dicts
for spec in specification:
    specification_elem = etree.SubElement(root, "specification")
    propertyName_elem = etree.SubElement(specification_elem, "propertyName")
    propertyName_elem.text = spec_propertyName
    for value in spec:
        value_elem = etree.SubElement(specification_elem, "value")
        for key in value:
            key_elem = etree.SubElement(value_elem, key)
            keyValue_elem = etree.SubElement(key_elem, "value")
            keyValue_elem.text = value[key]
Here I will pass spec_propertyName as a separate parameter, so the user will pass both specification and spec_propertyName.
How about a list of Specification objects, each of which has a propertyName and a list of value dictionaries? (The values could be a list of objects instead of dictionaries.)
For example:
class Value(object):
    def __init__(self,
                 number=None,
                 text=None,
                 url=None):
        self.number = number
        self.text = text
        self.url = url

class Specification(object):
    def __init__(self, propertyName):
        self.propertyName = propertyName
        self.values = []

spec1 = Specification("Spec1")
spec1.values = [Value(number=3,
                      url="http://www.google.com"),
                Value(text="Hello, World!",
                      url="http://www.jasonfruit.com")]

spec2 = Specification("Spec2")
spec2.values = [Value(number=27,
                      text="I can haz cheezburger?",
                      url="http://stackoverflow.com"),
                Value(text="Running out of ideas.",
                      url="http://news.google.com")]
If propertyNames are unique, you can have a dict of list of dict. If they are not, you can have a list of list of list of dict.
If all those lists end up hard to keep track of, you can make a class to store the data:
class Specification:
    def __init__(self):
        self.propertyName = ""
        self.values = []
Then, use:
spec = Specification()
spec.propertyName = "string"
value = {"URL": "someURI", "text": "someText"}
spec.values.append(value)
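To tie this back to the question's etree code, the loop only changes slightly; a rough sketch (element and variable names follow the question, and spec_list is just the objects built above):

import xml.etree.ElementTree as etree

spec_list = [spec]  # e.g. the Specification object built above
root = etree.Element("specifications")
for spec in spec_list:
    specification_elem = etree.SubElement(root, "specification")
    propertyName_elem = etree.SubElement(specification_elem, "propertyName")
    propertyName_elem.text = spec.propertyName  # no separate parameter needed
    for value in spec.values:
        value_elem = etree.SubElement(specification_elem, "value")
        for key, val in value.items():
            key_elem = etree.SubElement(value_elem, key)
            keyValue_elem = etree.SubElement(key_elem, "value")
            keyValue_elem.text = val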
Here's a representation approach using named tuples. You can upgrade to using classes (adding code to do some validation on the caller's input, and allowing the omission of field=None for optional fields) at your leisure with no other change to the user API and little change to the ElementTree-building code.
# -*- coding: cp1252 -*-
from collections import namedtuple
import xml.etree.cElementTree as etree

Specifications = namedtuple('Specifications', 'specification_list')
Specification = namedtuple('Specification', 'propertyName value_list')
Value = namedtuple('Value', 'number text url')

def make_etree(specifications, encoding):
    """
    Convert user's `specifications` to an ElementTree.
    `encoding` is encoding of *input* `str` objects.
    """
    def ensure_unicode(v):
        if isinstance(v, str): return v.decode(encoding)
        if isinstance(v, unicode): return v
        return unicode(v)  # convert numbers etc to unicode strings
    root = etree.Element('specifications')
    for spec in specifications.specification_list:
        specification_elem = etree.SubElement(root, "specification")
        propertyName_elem = etree.SubElement(specification_elem, "propertyName")
        propertyName_elem.text = ensure_unicode(spec.propertyName)
        for value in spec.value_list:
            value_elem = etree.SubElement(specification_elem, "value")
            for key in value._fields:
                kv = getattr(value, key)
                if kv is None: continue
                key_elem = etree.SubElement(value_elem, key)
                keyValue_elem = etree.SubElement(key_elem, "value")
                keyValue_elem.text = ensure_unicode(kv)
    return etree.ElementTree(root)

# === sample caller code follows ===

specs = Specifications(
    specification_list=[
        Specification(
            propertyName='a prop',
            value_list=[
                Value(
                    number=42,
                    text='universe',
                    url='http://uww.everywhere',
                ),
                Value(
                    number=0,
                    text=None,  # optional
                    url='file:///dev/null',
                ),
            ],
        ),
        Specification(
            propertyName='b prop',
            value_list=[
                Value(
                    number=1,
                    text='Üñîçøðè',   # str object, encoded in cp1252
                    url=u'Üñîçøðè',   # unicode object
                ),
            ],
        ),
    ],
)

print repr(specs); print

import sys
tree = make_etree(specs, 'cp1252')
import cStringIO
f = cStringIO.StringIO()
tree.write(f, encoding='UTF-8', xml_declaration=True)
print repr(f.getvalue())
print
Output (folded at column 80):
Specifications(specification_list=[Specification(propertyName='a prop', value_li
st=[Value(number=42, text='universe', url='http://uww.everywhere'), Value(number
=0, text=None, url='file:///dev/null')]), Specification(propertyName='b prop', v
alue_list=[Value(number=1, text='\xdc\xf1\xee\xe7\xf8\xf0\xe8', url=u'\xdc\xf1\x
ee\xe7\xf8\xf0\xe8')])])
"<?xml version='1.0' encoding='UTF-8'?>\n<specifications><specification><propert
yName>a prop</propertyName><value><number><value>42</value></number><text><value
>universe</value></text><url><value>http://uww.everywhere</value></url></value><
value><number><value>0</value></number><url><value>file:///dev/null</value></url
></value></specification><specification><propertyName>b prop</propertyName><valu
e><number><value>1</value></number><text><value>\xc3\x9c\xc3\xb1\xc3\xae\xc3\xa7
\xc3\xb8\xc3\xb0\xc3\xa8</value></text><url><value>\xc3\x9c\xc3\xb1\xc3\xae\xc3\
xa7\xc3\xb8\xc3\xb0\xc3\xa8</value></url></value></specification></specification
s>"
