In my Python Utilities GitHub repo I have a function that strips non-printing characters and invalid Unicode bytes from strings, mappings, and sequences:
import unicodedata

def filterCharacters(s):
"""
    Strip non-printable characters.
    :type s: dict|list|tuple|bytes|str
    :param s: Object to remove non-printable characters from
    :rtype: dict|list|tuple|bytes|str
    :return: An object corresponding to the original, with non-printable characters removed.
"""
validCategories = (
'Lu', 'Ll', 'Lt', 'LC', 'Lm', 'Lo', 'L', 'Mn', 'Mc', 'Me', 'M', 'Nd', 'Nl', 'No', 'N', 'Pc',
'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'P', 'Sm', 'Sc', 'Sk', 'So', 'S', 'Zs', 'Zl', 'Zp', 'Z'
)
convertToBytes = False
if isinstance(s, dict):
new = {}
for k,v in s.items(): # This is the offending line
new[k] = filterCharacters(v)
return new
if isinstance(s, list):
new = []
for item in s:
new.append(filterCharacters(item))
return new
if isinstance(s, tuple):
new = []
for item in s:
new.append(filterCharacters(item))
return tuple(new)
if isinstance(s, bytes):
s = s.decode('utf-8')
convertToBytes = True
if isinstance(s, str):
s = ''.join(c for c in s if unicodedata.category(c) in validCategories)
if convertToBytes:
s = s.encode('utf-8')
return s
else:
return None
Sometimes this function throws an exception:
Traceback (most recent call last):
File "./util.py", line 56, in filterCharacters
for k,v in s.items():
RuntimeError: dictionary changed size during iteration
I don't see where I am changing the dictionary sent as an argument. Why is this exception being thrown, then?
Thanks!
In Python 3, dict.items() returns a view object (dict_items), not a list as in Python 2. Looking through the CPython source, I noticed comments like
Objects/dictobject.c
dict_items(register PyDictObject *mp)
{
...
/* Preallocate the list of tuples, to avoid allocations during
* the loop over the items, which could trigger GC, which
* could resize the dict. :-(
*/
...
if (n != mp->ma_used) {
/* Durnit. The allocations caused the dict to resize.
* Just start over, this shouldn't normally happen.
*/
Py_DECREF(v);
goto again;
}
...
}
So not only dict deletions and insertions can cause this error to show up, but, in that code path, any allocation can as well!
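For comparison, the usual way to hit this exact error from pure Python is mutating the dict while you iterate over it; a minimal reproduction (not the OP's code):
d = {'a': 1}
for k in d:
    d[k * 2] = 2  # RuntimeError: dictionary changed size during iteration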
The resizing procedure itself is also interesting. Look at
static int
dictresize(PyDictObject *mp, Py_ssize_t minused)
{
...
}
But that's all internals.
Solution
Try materializing the view into a list before iterating:
if isinstance(s, dict):
    new = {}
    items = list(s.items())
    for k, v in items:
        new[k] = filterCharacters(v)
    return new
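As a quick sanity check of the idea (a toy example, not the original data), snapshotting the items lets you mutate the dict freely while looping:
d = {'a': 1, 'b': 2}
for k, v in list(d.items()):  # iterate over a snapshot, not the live view
    d[k + k] = v              # safe: the snapshot is independent of the dict
print(d)  # {'a': 1, 'b': 2, 'aa': 1, 'bb': 2}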
Related
I'm looking for any suggestions to resolve an issue I'm facing. It might seem like a simple problem, but after a few days of trying to find an answer, I don't think it is anymore.
I'm receiving data (StringType) in the following JSON-like format, and I need to turn it into a flat key-value dictionary. Here is a payload sample:
s = """{"status": "active", "name": "{\"first\": \"John\", \"last\": \"Smith\"}", "street_address": "100 \"Y\" Street"}"""
and the desired output should look like this:
{'status': 'active', 'name_first': 'John', 'name_last': 'Smith', 'street_address': '100 "Y" Street'}
The issue is that I can't find a way to turn the original string (s) into a dictionary. Once that works, the flattening part works perfectly fine.
import json
import collections.abc
import ast
#############################################################
# Flatten complex structure into a flat dictionary
#############################################################
def flatten_dictionary(dictionary, parent_key=False, separator='_', value_to_str=True):
"""
Turn a nested complex json into a flattened dictionary
:param dictionary: The dictionary to flatten
:param parent_key: The string to prepend to dictionary's keys
:param separator: The string used to separate flattened keys
:param value_to_str: Force all returned values to string type
:return: A flattened dictionary
"""
items = []
for key, value in dictionary.items():
new_key = str(parent_key) + separator + key if parent_key else key
try:
value = json.loads(value)
        except (TypeError, ValueError):
            pass
        if isinstance(value, collections.abc.MutableMapping):
if not value.items():
items.append((new_key,None))
else:
items.extend(flatten_dictionary(value, new_key, separator).items())
elif isinstance(value, list):
if len(value):
for k, v in enumerate(value):
items.extend(flatten_dictionary({str(k): (str(v) if value_to_str else v)}, new_key).items())
else:
items.append((new_key,None))
else:
items.append((new_key, (str(value) if value_to_str else value)))
return dict(items)
# Data sample; string and dictionary
s = """{"status": "active", "name": "{\"first\": \"John\", \"last\": \"Smith\"}", "street_address": "100 \"Y\" Street"}"""
d = {"status": "active", "name": "{\"first\": \"John\", \"last\": \"Smith\"}", "street_address": "100 \"Y\" Street"}
# Works for dictionary type
print(flatten_dictionary(d))
# Doesn't work for string type, for any of the below methods
e = eval(s)
# a = ast.literal_eval(s)
# j = json.loads(s)
Try:
import json
import re
def jsonify(s):
s = s.replace('"{','{').replace('}"','}')
s = re.sub(r'street_address":\s+"(.+)"(.+)"(.+)"', r'street_address": "\1\2\3"',s)
return json.loads(s)
If you must keep the quotes around Y, try:
def jsonify(s):
s = s.replace('"{','{').replace('}"','}')
search = re.search(r'street_address":\s+"(.+)"(.+)"(.+)"',s)
if search:
s = re.sub(r'street_address":\s+"(.+)"(.+)"(.+)"', r'street_address": "\1\2\3"',s)
dict_version = json.loads(s)
dict_version['street_address'] = dict_version['street_address'].replace(search.group(2),'"'+search.group(2)+'"')
return dict_version
A more generalized attempt:
def jsonify(s):
pattern = r'(?<=[,}])\s*"(.[^\{\}:,]+?)":\s+"([^\{\}:,]+?)"([^\{\}:,]+?)"([^\{\}:,]+?)"([,\}])'
s = s.replace('"{','{').replace('}"','}')
search = re.search(pattern,s)
matches = []
if search:
matches = re.findall(pattern,s)
s = re.sub(pattern, r'"\1": "\2\3\4"\5',s)
dict_version = json.loads(s)
for match in matches:
dict_version[match[0]] = dict_version[match[0]].replace(match[2],'"'+match[2]+'"')
return dict_version
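If the generalized jsonify above works for your payload, the whole task is then just a composition of the two functions (using the sample s and the flatten_dictionary from the question):
flat = flatten_dictionary(jsonify(s))
print(flat)
# expected, roughly:
# {'status': 'active', 'name_first': 'John', 'name_last': 'Smith', 'street_address': '100 "Y" Street'}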
Let's say I've got a nested dictionary of the form:
{'geo': {'bgcolor': 'white','lakecolor': 'white','caxis': {'gridcolor': 'white', 'linecolor': 'white',}},
'title': {'x': 0.05},
'yaxis': {'automargin': True,'linecolor': 'white','zerolinecolor': 'white','zerolinewidth': 2}
}
How can you work your way through that dict and build a list of every complete key path that leads to the value 'white'?
A function defined by user jfs in the post "Search for a value in a nested dictionary python" lets you check whether 'white' occurs at least once, and it also returns the path:
# dictionary
d={'geo': {'bgcolor': 'white','lakecolor': 'white','caxis': {'gridcolor': 'white', 'linecolor': 'white',}},
'title': {'x': 0.05},
'yaxis': {'automargin': True,'linecolor': 'white','ticks': '','zerolinecolor': 'white','zerolinewidth': 2}
}
# function:
def getpath(nested_dict, value, prepath=()):
for k, v in nested_dict.items():
path = prepath + (k,)
if v == value: # found value
return path
elif hasattr(v, 'items'): # v is a dict
p = getpath(v, value, path) # recursive call
if p is not None:
return p
getpath(d,'white')
# out:
('geo', 'bgcolor')
But 'white' occurs in other places too, such as:
1. d['geo']['lakecolor']
2. d['geo']['caxis']['gridcolor']
3. d['yaxis']['linecolor']
How can I make sure that the function finds all paths?
I've tried applying the function repeatedly, eliminating the found paths one by one until it returns None, but that quickly turned into an ugly mess.
Thank you for any suggestions!
This is a perfect use case to write a generator:
def find_paths(haystack, needle):
if haystack == needle:
yield ()
if not isinstance(haystack, dict):
return
for key, val in haystack.items():
for subpath in find_paths(val, needle):
yield (key, *subpath)
You can use it as follows:
d = {
'geo': {'bgcolor': 'white','lakecolor': 'white','caxis': {'gridcolor': 'white', 'linecolor': 'white',}},
'title': {'x': 0.05},
'yaxis': {'automargin': True,'linecolor': 'white','ticks': '','zerolinecolor': 'white','zerolinewidth': 2}
}
# you can iterate over the paths directly...
for path in find_paths(d, 'white'):
print('found at path: ', path)
# ...or you can collect them into a list:
paths = list(find_paths(d, 'white'))
print('found at paths: ' + repr(paths))
The generator approach has the advantage that it doesn't need to create an object to keep all paths in memory at once; they can be processed one by one and immediately discarded. In this case, the memory savings would be rather modest, but in others they may be significant. Also, if a loop iterating over a generator is terminated early, the generator is not going to keep searching for more paths that would be later discarded anyway.
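For example, if you only need the first matching path, the generator stops searching as soon as one is found:
first = next(find_paths(d, 'white'), None)
print(first)  # e.g. ('geo', 'bgcolor')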
Just transform your function so that it returns a list, and don't return as soon as something is found; add to or extend the list instead:
def getpath(nested_dict, value, prepath=()):
p = []
for k, v in nested_dict.items():
path = prepath + (k,)
if v == value: # found value
p.append(path)
elif hasattr(v, 'items'): # v is a dict
p += getpath(v, value, path) # recursive call
return p
With your input data, this produces (the order may vary on Python versions where dictionaries are unordered):
[('yaxis', 'linecolor'), ('yaxis', 'zerolinecolor'), ('geo', 'lakecolor'),
('geo', 'caxis', 'linecolor'), ('geo', 'caxis', 'gridcolor'), ('geo', 'bgcolor')]
Returning early is what makes the result incomplete. Instead of returning, use a separate list to collect the paths. Here I'm using a module-level list, cur_list, and printing it after the traversal:
d = {
'geo': {'bgcolor': 'white',
'caxis': {'gridcolor': 'white', 'linecolor': 'white'},
'lakecolor': 'white'},
'title': {'x': 0.05},
'yaxis': {'automargin': True,
'linecolor': 'white',
'ticks': '',
'zerolinecolor': 'white',
'zerolinewidth': 2}
}
cur_list = []
def getpath(nested_dict, value, prepath=()):
for k, v in nested_dict.items():
path = prepath + (k,)
if v == value: # found value
cur_list.append(path)
        elif isinstance(v, dict):  # v is a dict
            getpath(v, value, path)  # recursive call appends to cur_list directly
getpath(d,'white')
print(cur_list)
# RESULT:
# [('geo', 'bgcolor'), ('geo', 'caxis', 'gridcolor'), ('geo', 'caxis', 'linecolor'), ('geo', 'lakecolor'), ('yaxis', 'linecolor'), ('yaxis', 'zerolinecolor')]
I needed this functionality for traversing HDF files with h5py. This code is a slight alteration of user114332's answer: it looks for keys instead of values, and it additionally yields the needle in the result, in case that is useful to someone else.
import h5py
def find_paths(haystack, needle):
if not isinstance(haystack, h5py.Group) and not isinstance(haystack, dict):
return
if needle in haystack:
yield (needle,)
for key, val in haystack.items():
for subpath in find_paths(val, needle):
yield (key, *subpath)
Execution:
sf = h5py.File("file.h5py", mode = "w")
g = sf.create_group("my group")
h = g.create_group("my2")
k = sf.create_group("two group")
l = k.create_group("my2")
a = l.create_group("my2")
for path in find_paths(sf, "my2"):
print('found at path: ', path)
which prints the following
found at path: ('my group', 'my2')
found at path: ('two group', 'my2')
found at path: ('two group', 'my2', 'my2')
There is a way to initialize a structure from a dictionary:
fooData= {'y': 1, 'x': 2}
fooStruct = ffi.new("foo_t*", fooData)
fooBuffer = ffi.buffer(fooStruct)
Is there a ready-made function to do the conversion in the other direction?
fooStruct = ffi.new("foo_t*")
(ffi.buffer(fooStruct))[:] = fooBuffer
fooData= convert_to_python( fooStruct[0] )
Do I have to walk ffi.typeof("foo_t").fields myself?
This is the code I have come up with so far:
def __convert_struct_field( s, fields ):
for field,fieldtype in fields:
if fieldtype.type.kind == 'primitive':
yield (field,getattr( s, field ))
else:
yield (field, convert_to_python( getattr( s, field ) ))
def convert_to_python(s):
type=ffi.typeof(s)
if type.kind == 'struct':
return dict(__convert_struct_field( s, type.fields ) )
elif type.kind == 'array':
if type.item.kind == 'primitive':
return [ s[i] for i in range(type.length) ]
else:
return [ convert_to_python(s[i]) for i in range(type.length) ]
elif type.kind == 'primitive':
return int(s)
Is there a faster way?
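For reference, a minimal usage sketch of the code above (assuming foo_t has been declared via ffi.cdef as a struct with two int fields, y and x):
fooStruct = ffi.new("foo_t*", {'y': 1, 'x': 2})
fooData = convert_to_python(fooStruct[0])
print(fooData)  # expected: {'y': 1, 'x': 2}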
Arpegius' solution works fine for me, and is quite elegant. I implemented a solution based on Selso's suggestion to use inspect; dir() could substitute for inspect here.
from inspect import getmembers
from cffi import FFI
ffi = FFI()
from pprint import pprint
def cdata_dict(cd):
if isinstance(cd, ffi.CData):
try:
return ffi.string(cd)
except TypeError:
try:
return [cdata_dict(x) for x in cd]
except TypeError:
return {k: cdata_dict(v) for k, v in getmembers(cd)}
else:
return cd
foo = ffi.new("""
struct Foo {
char name[6];
struct {
int a, b[3];
} item;
} *""",{
'name': b"Foo",
'item': {'a': 3, 'b': [1, 2, 3]}
})
pprint(cdata_dict(foo))
Output:
{'item': {'a': 3, 'b': [1, 2, 3]}, 'name': b'Foo'}
Unfortunately, this code does not work for me: some struct members are "pointer" types, and it ends up storing None in the dict for them.
I am a Python noob, but maybe the inspect module would be another starting point, and a shorter way to print "simple" data. We would then iterate over the result in order to unroll the data structure.
For example, with the following struct:
struct foo {
int a;
char b[10];
};
Using inspect.getmembers(obj), I get the following result:
[('a', 10), ('b', <cdata 'char[10]' 0x7f0be10e2824>)]
Your code is fine.
Even if there was a built-in way in CFFI, it would not be what you need here. Indeed, you can say ffi.new("foo_t*", {'p': p1}) where p1 is another cdata, but you cannot recursively pass a dictionary containing more dictionaries. The same would be true in the opposite direction: you would get a dictionary that maps field names to "values", but the values themselves would be more cdata objects anyway, and not recursively more dictionaries.
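To illustrate that last point, here is a small sketch; the point_t/foo_t declarations are hypothetical, just to show the shape of what does and does not work:
from cffi import FFI
ffi = FFI()
ffi.cdef("""
typedef struct { int x, y; } point_t;
typedef struct { point_t *p; } foo_t;
""")
p1 = ffi.new("point_t*", {'x': 2, 'y': 1})
foo = ffi.new("foo_t*", {'p': p1})            # works: p1 is an existing cdata
# ffi.new("foo_t*", {'p': {'x': 2, 'y': 1}})  # fails: a dict cannot stand in for a pointer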
Sorry for my english
I have a list like:
[['string type','short int type','long int type','string type','float'],
['Stackoverflow','32','0','any stringgg','55.0'],
['anystring','16','1654657987984','striiingg','2.5']]
I call
['string type','short int type','long int type','string type','float']
the first sub-list,
['Stackoverflow','32','0','any stringgg','55.0']
the second sub-list, and likewise for the third sub-list.
How can I use struct.pack() on the data in the second and third sub-lists, based on the types given in the first sub-list?
You could do something like this (quickly coded, could use some work):
import struct
type_map = {
'string type': 's',
'short int type': 'h',
'long int type': 'q',
'float': 'f'
}
conversion = {
's': str,
'h': int,
'q': int,
'f': float
}
def do_pack(types, data):
if len(types) != len(data):
        raise Exception("wrong lengths")
packing = '<'
data_iter = []
for i, struct_type in enumerate(types):
t = type_map[struct_type]
if t == 's':
packing += '%ds' % len(data[i])
data_iter.append(data[i])
else:
packing += t
data_iter.append(conversion[t](data[i]))
return struct.pack(packing, *data_iter), packing
packer = [['string type','short int type','long int type','string type','float'],['Stackoverflow','32','0','any stringgg','55.0'],['anystring','16','1654657987984','striiingg','2.5']]
types = packer[0]
for data_set in packer[1:]:
binary, packing = do_pack(types, data_set)
print struct.unpack(packing, binary)
OUTPUT
('Stackoverflow', 32, 0, 'any stringgg', 55.0)
('anystring', 16, 1654657987984, 'striiingg', 2.5)
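The code above is Python 2 (print statement, str values for the 's' format). On Python 3, struct.pack needs bytes for 's' and print is a function, so a sketch of the same approach might look like this (do_pack3 is just an illustrative variant reusing the type_map, conversion, types and packer defined above; unpacked strings come back as bytes):
def do_pack3(types, data):
    if len(types) != len(data):
        raise ValueError("wrong lengths")
    packing = '<'
    values = []
    for struct_type, value in zip(types, data):
        t = type_map[struct_type]
        if t == 's':
            encoded = value.encode('utf-8')  # 's' requires bytes on Python 3
            packing += '%ds' % len(encoded)
            values.append(encoded)
        else:
            packing += t
            values.append(conversion[t](value))
    return struct.pack(packing, *values), packing

for data_set in packer[1:]:
    binary, packing = do_pack3(types, data_set)
    print(struct.unpack(packing, binary))
# e.g. (b'Stackoverflow', 32, 0, b'any stringgg', 55.0)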
I have this code
import json
from pprint import pprint
json_data=open('bookmarks.json')
jdata = json.load(json_data)
pprint (jdata)
json_data.close()
How can I search through it for u'uri': u'http:'?
ObjectPath is a library that provides the ability to query JSON and nested structures of dicts and lists. For example, you can search for all attributes called "foo", no matter how deep they are, by using $..foo.
While the documentation focuses on the command line interface, you can perform the queries programmatically by using the package's Python internals. The example below assumes you've already loaded the data into Python data structures (dicts & lists). If you're starting with a JSON file or string you just need to use load or loads from the json module first.
import objectpath
data = [
{'foo': 1, 'bar': 'a'},
{'foo': 2, 'bar': 'b'},
{'NoFooHere': 2, 'bar': 'c'},
{'foo': 3, 'bar': 'd'},
]
tree_obj = objectpath.Tree(data)
tuple(tree_obj.execute('$..foo'))
# returns: (1, 2, 3)
Notice that it just skipped elements that lacked a "foo" attribute, such as the third item in the list. You can also do much more complex queries, which makes ObjectPath handy for deeply nested structures (e.g. finding where x has y that has z: $.x.y.z). I refer you to the documentation and tutorial for more information.
As json.loads simply returns a dict, you can use the operators that apply to dicts:
>>> jdata = json.load('{"uri": "http:", "foo", "bar"}')
>>> 'uri' in jdata # Check if 'uri' is in jdata's keys
True
>>> jdata['uri'] # Will return the value belonging to the key 'uri'
u'http:'
Edit: to give an idea of how to loop through the data, consider the following example:
>>> import json
>>> jdata = json.loads(open ('bookmarks.json').read())
>>> for c in jdata['children'][0]['children']:
... print 'Title: {}, URI: {}'.format(c.get('title', 'No title'),
...                              c.get('uri', 'No uri'))
...
Title: Recently Bookmarked, URI: place:folder=BOOKMARKS_MENU(...)
Title: Recent Tags, URI: place:sort=14&type=6&maxResults=10&queryType=1
Title: , URI: No uri
Title: Mozilla Firefox, URI: No uri
Inspecting the jdata data structure will allow you to navigate it as you wish. The pprint call you already have is a good starting point for this.
Edit 2: another attempt. This collects the file you mentioned into a list of dictionaries. With this, I think you should be able to adapt it to your needs.
>>> def build_structure(data, d=None):
...     if d is None:
...         d = []
... if 'children' in data:
... for c in data['children']:
... d.append({'title': c.get('title', 'No title'),
... 'uri': c.get('uri', None)})
... build_structure(c, d)
... return d
...
>>> pprint.pprint(build_structure(jdata))
[{'title': u'Bookmarks Menu', 'uri': None},
{'title': u'Recently Bookmarked',
'uri': u'place:folder=BOOKMARKS_MENU&folder=UNFILED_BOOKMARKS&(...)'},
{'title': u'Recent Tags',
'uri': u'place:sort=14&type=6&maxResults=10&queryType=1'},
{'title': u'', 'uri': None},
{'title': u'Mozilla Firefox', 'uri': None},
{'title': u'Help and Tutorials',
'uri': u'http://www.mozilla.com/en-US/firefox/help/'},
(...)
}]
To then "search through it for u'uri': u'http:'", do something like this:
for c in build_structure(jdata):
    if c['uri'] and c['uri'].startswith('http:'):
print 'Started with http'
It seems there are typos in the example provided by jro: a colon is missing in the JSON, and json.loads (not json.load) is needed for a string.
The correct syntax would be:
jdata = json.loads('{"uri": "http:", "foo": "bar"}')
This cleared it up for me when playing with the code.
Functions to search through and pretty-print dicts, such as parsed JSON.
Written for Python 3.
Search:
def pretty_search(dict_or_list, key_to_search, search_for_first_only=False):
"""
    Give it a dict or a list of dicts and a dict key (to get the values of);
    it will search through it and all contained dicts and lists
    for all values of that key, and will return a set of them,
    unless you specify search_for_first_only=True.
:param dict_or_list:
:param key_to_search:
:param search_for_first_only:
:return:
"""
search_result = set()
if isinstance(dict_or_list, dict):
for key in dict_or_list:
key_value = dict_or_list[key]
if key == key_to_search:
if search_for_first_only:
return key_value
else:
search_result.add(key_value)
if isinstance(key_value, dict) or isinstance(key_value, list) or isinstance(key_value, set):
_search_result = pretty_search(key_value, key_to_search, search_for_first_only)
if _search_result and search_for_first_only:
return _search_result
elif _search_result:
for result in _search_result:
search_result.add(result)
elif isinstance(dict_or_list, list) or isinstance(dict_or_list, set):
for element in dict_or_list:
if isinstance(element, list) or isinstance(element, set) or isinstance(element, dict):
                _search_result = pretty_search(element, key_to_search, search_for_first_only)
if _search_result and search_for_first_only:
return _search_result
elif _search_result:
for result in _search_result:
search_result.add(result)
return search_result if search_result else None
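For example, with the bookmarks data from the question already loaded into jdata, you could collect every URI with something like:
uris = pretty_search(jdata, 'uri')
print(uris)  # a set of all values stored under a 'uri' key, or None if there are none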
Print:
def pretty_print(dict_or_list, print_spaces=0):
"""
    Give it a dict or a list of dicts,
    and it will return a pretty, printable version
    of the structure you gave it.
:param dict_or_list:
:param print_spaces:
:return:
"""
pretty_text = ""
if isinstance(dict_or_list, dict):
for key in dict_or_list:
key_value = dict_or_list[key]
if isinstance(key_value, dict):
key_value = pretty_print(key_value, print_spaces + 1)
pretty_text += "\t" * print_spaces + "{}:\n{}\n".format(key, key_value)
elif isinstance(key_value, list) or isinstance(key_value, set):
pretty_text += "\t" * print_spaces + "{}:\n".format(key)
for element in key_value:
if isinstance(element, dict) or isinstance(element, list) or isinstance(element, set):
pretty_text += pretty_print(element, print_spaces + 1)
else:
pretty_text += "\t" * (print_spaces + 1) + "{}\n".format(element)
else:
pretty_text += "\t" * print_spaces + "{}: {}\n".format(key, key_value)
elif isinstance(dict_or_list, list) or isinstance(dict_or_list, set):
for element in dict_or_list:
if isinstance(element, dict) or isinstance(element, list) or isinstance(element, set):
pretty_text += pretty_print(element, print_spaces + 1)
else:
pretty_text += "\t" * print_spaces + "{}\n".format(element)
else:
pretty_text += str(dict_or_list)
if print_spaces == 0:
print(pretty_text)
return pretty_text
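The printer can be used directly on the same data; it prints a tab-indented view and also returns it as a string:
text = pretty_print(jdata)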
You can use jsonpipe if you just need the output (and are more comfortable with the command line):
cat bookmarks.json | jsonpipe | grep uri