Python Cerberus - one field is required if another one does not exist

What I want to achieve:
>>> from cerberus import Validator
>>> schema = {"x": {"type": "integer", "required": False}, "y": {"type": "integer", "required": False}}
>>> v = Validator(schema)
>>> v.validate({"x": 5})
True
>>> v.validate({"y": 6})
True
>>> v.validate({"x": 5, "y": 6})
True
>>> v.validate({})
False
I have checked all the documentation but still don't know how to achieve this result. How should I define the schema?

One viable solution is to run the document through Validator() multiple times, once per alternative schema.
from cerberus import Validator

def composite_validator(document):
    REQUIRED_INTEGER = {"type": "integer", "required": True}
    OPTIONAL_INTEGER = {"type": "integer", "required": False}
    schemas = [
        {"x": REQUIRED_INTEGER, "y": OPTIONAL_INTEGER},
        {"x": OPTIONAL_INTEGER, "y": REQUIRED_INTEGER},
    ]
    common_schema = {"z1": REQUIRED_INTEGER, "z2": OPTIONAL_INTEGER, "z3": REQUIRED_INTEGER}
    for s in schemas:
        s.update(common_schema)
    validator = Validator()
    return any(validator(document, s) for s in schemas)
Test results:
for case in [
        {"x": 5, "z1": 0, "z3": -1},
        {"y": 6, "z1": 0, "z3": -1},
        {"x": 5, "y": 6, "z1": 0, "z3": -1},
        {"z1": 0, "z3": -1}]:
    print(case)
    print(composite_validator(case))
#{'x': 5, 'z1': 0, 'z3': -1}
#True
#{'y': 6, 'z1': 0, 'z3': -1}
#True
#{'x': 5, 'y': 6, 'z1': 0, 'z3': -1}
#True
#{'z1': 0, 'z3': -1}
#False
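If maintaining several schemas feels heavy for the simple two-field case from the question, another option is to keep both fields optional and do the at-least-one-of check in plain Python around validate(). This is only a sketch of that idea, not a built-in Cerberus rule; it assumes the two-field schema shown at the top of the question:
from cerberus import Validator

schema = {"x": {"type": "integer", "required": False},
          "y": {"type": "integer", "required": False}}
v = Validator(schema)

def validate_at_least_one(document):
    # plain pre-check: at least one of "x"/"y" must be present,
    # then Cerberus handles types and unknown keys as usual
    if "x" not in document and "y" not in document:
        return False
    return v.validate(document)

print(validate_at_least_one({"x": 5}))          # True
print(validate_at_least_one({"y": 6}))          # True
print(validate_at_least_one({"x": 5, "y": 6}))  # True
print(validate_at_least_one({}))                # False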

Related

How can I match two lists and remove duplicates based on a condition on the dictionaries in a list

I want to merge two lists, get the data matched without duplicates, and map them to a new structure.
I have two lists. Here are the two lists I am trying to merge:
cats = [
    'orange', 'apple', 'banana'
]
and a second list
types = [
    {"id": 1, "type": "orange"},
    {"id": 2, "type": "apple"},
    {"id": 3, "type": "apple"},
    {"id": 4, "type": "orange"},
    {"id": 5, "type": "banana"}
]
and I want to combine them to get this result:
[
    {'orange': {'UNIT': [1, 4]}},
    {'apple': {'UNIT': [2, 3]}},
    {'banana': {'UNIT': [5]}}
]
And here is my code; after my attempts I get the result below:
matched = []
for item in types:
    for cat in cats:
        if item['type'] == cat:
            matched.append(
                {
                    cat: {
                        "UNIT": [i['id'] for i in types if 'id' in i]
                    }
                }
            )
and my result is like this
[{'orange': {'UNIT': [1, 2, 3, 4, 5]}},
{'apple': {'UNIT': [1, 2, 3, 4, 5]}},
{'apple': {'UNIT': [1, 2, 3, 4, 5]}},
{'orange': {'UNIT': [1, 2, 3, 4, 5]}},
{'banana': {'UNIT': [1, 2, 3, 4, 5]}}]
Your problem is the condition inside your list comprehension: it checks 'id' in i instead of matching the type, so every comprehension collects all of the ids. Besides that, your code is more complex than it needs to be. You also get duplicates from your nested for loops because you never check whether that fruit was already added to matched.
To reduce the two lists to the needed values you can use:
cats = ['orange', 'apple', 'banana']
types = [{"id": 1, "type": "orange"},
         {"id": 2, "type": "apple"},
         {"id": 3, "type": "apple"},
         {"id": 4, "type": "orange"},
         {"id": 5, "type": "banana"}]

rv = {}
for inner in types:
    t = inner["type"]
    if t not in cats:   # not needed for the current data, only needed
        continue        # if some list elements do not occur in cats
    rv.setdefault(t, [])
    rv[t].append(inner["id"])

print(rv)
which leads to an easier dictionary with all the data you need:
{'orange': [1, 4], 'apple': [2, 3], 'banana': [5]}
From there you can build up your overly complex list of dicts with 1 key each:
lv = [{k: {"UNIT": v}} for k, v in rv.items()]
print(lv)
to get
[{'orange': {'UNIT': [1, 4]}},
{'apple': {'UNIT': [2, 3]}},
{'banana': {'UNIT': [5]}}]
Answer to the extended problem from the comment:
If there are more things you need to capture, you can leverage the fact that lists and dicts are stored by reference:
cats = ['orange', 'apple', 'banana']
types = [{"id": 1, "type": "orange", "bouncyness": 42},
         {"id": 2, "type": "apple", "bouncyness": 21},
         {"id": 3, "type": "apple", "bouncyness": 63},
         {"id": 4, "type": "orange", "bouncyness": 84},
         {"id": 5, "type": "banana", "bouncyness": 99}]

rv = []    # list of dicts - single key is the fruit name
pil = {}   # dict that keeps track of which fruit sits at what position in rv,
           # to avoid iterating over the list to find the correct fruit-dict
di = None  # the current fruit's dictionary
for inner in types:
    t = inner["type"]
    if t not in cats:   # not needed for the current data, only needed
        continue        # if some list elements do not occur in cats
    # step 1: find the correct fruit dict in rv, or create a new one and add it
    di = None
    if t in pil:
        # get the cached dict by the fruit's position in rv
        di = rv[pil[t]]
    else:
        # create the fruit dict, cache its position in rv in pil
        di = {}
        rv.append(di)
        pil[t] = len(rv) - 1
    # step 2: create all the needed inner lists
    # you can speed this up using defaultdict(list) if speed gets
    # problematic - until then dict.setdefault should be fine
    # (see the defaultdict sketch below)
    di.setdefault(t, [])
    di.setdefault("bouncyness", [])
    # step 3: fill with values
    di[t].append(inner["id"])
    di["bouncyness"].append(inner["bouncyness"])

print(rv)
to get
[{'orange': [1, 4], 'bouncyness': [42, 84]},
{'apple': [2, 3], 'bouncyness': [21, 63]},
{'banana': [5], 'bouncyness': [99]}]
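As the comment in the loop above mentions, dict.setdefault can be swapped for collections.defaultdict(list) if speed ever becomes a concern. A minimal sketch of the flat grouping step with defaultdict (the nested variant follows the same pattern):
from collections import defaultdict

cats = ['orange', 'apple', 'banana']
types = [{"id": 1, "type": "orange"},
         {"id": 2, "type": "apple"},
         {"id": 3, "type": "apple"},
         {"id": 4, "type": "orange"},
         {"id": 5, "type": "banana"}]

# defaultdict creates the missing list on first access,
# so no setdefault call is needed per item
rv = defaultdict(list)
for inner in types:
    if inner["type"] in cats:
        rv[inner["type"]].append(inner["id"])

print(dict(rv))
# {'orange': [1, 4], 'apple': [2, 3], 'banana': [5]}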
Here is an alternative approach using the filter() function and a lambda.
dict_lst = []
for cat in cats:
    cat_items = filter(lambda x: x['type'] == cat, types)
    cat_dict = {cat: {'UNIT': [x['id'] for x in cat_items]}}
    dict_lst.append(cat_dict)

print(dict_lst)
[{'orange': {'UNIT': [1, 4]}},
{'apple': {'UNIT': [2, 3]}},
{'banana': {'UNIT': [5]}}]
With a list comprehension:
[{cat:{'UNIT':[type_['id'] for type_ in types if type_['type'] == cat]}} for cat in cats]

Finding a circular path in a list of dicts (Python)

I have data that looks something like this
mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": None},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": None},
    ...
]
where the id is always the dict's position in the list.
The item_totals will be aggregated, and the "send_to_id" key affects how this aggregation occurs. Here, as the dict with "id" = 2 has "send_to_id" = 1, the dict with "id" = 1 is then what I'm calling a destination layer and the totals of these two items will be aggregated differently to normal.
What is not allowed is something circular like this, where item 1 points to item 2 and item 2 points to item 1.
mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": 2},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": None}
]
Or this, where it is still circular and takes three steps
mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": 3},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": 2}
]
Also an item cannot point to itself.
I'm not sure how to go about this, but I would like to take in a list of dicts like the above and find out if a circular path exists so I can provide an appropriate error message to the user. Can anyone help?
I'm trying to work out how to follow a path to the next item but it ties knots in my brain.
If it matters, the maximum length of the list will be about 50.
Thanks!
A good first step might be to create a dictionary of just the mappings. We can do that with a nice dictionary comprehension:
def get_direct_mappings(data: list) -> dict:
    return {d["id"]: d["send_to_id"] for d in data if d["send_to_id"] is not None}
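For instance, fed the three-step circular example from the question, the helper boils the data down to just the pointer map:
mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": 3},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": 2},
]
print(get_direct_mappings(mydict))
# {1: 3, 2: 1, 3: 2}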
Now we have a slightly more easily solved problem, and we can use previously made solutions to help us out. Specifically, we can basically just re-use the solution as posted after "Update", and add our above code to it.
def find_cycles(original_data: list) -> list:
    n = {d["id"]: d["send_to_id"] for d in original_data if d["send_to_id"] is not None}
    cycles = []
    while n:
        visited = {}
        count = 0
        k, v = n.popitem()
        while v is not None:
            # visited[k] = (count, v)
            visited[k] = count
            count += 1
            k = v
            v = n.pop(k, None)
        if k in visited:
            if len(visited) == 1:
                cycle = tuple(visited.keys())
            else:
                cycle_start = visited[k]
                cycle = sorted((c, k) for k, c in visited.items() if c >= cycle_start)
                cycle = tuple(k for c, k in cycle)
                k = min(range(len(cycle)), key=lambda x: cycle[x])
                cycle = cycle[k:] + cycle[:k]
            cycles.append(cycle)
    return cycles
While it's not the prettiest, it works.
mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": 3},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": 2}
]
print(find_cycles(mydict))
# prints [(1, 3, 2)]
mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": 2},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": None}
]
print(find_cycles(mydict))
# prints [(1, 2)]
mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": None},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": None},
]
print(find_cycles(mydict))
# prints []
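Since the question only asks whether a circular path exists (so the user can be shown an error message) and the list holds at most about 50 items, a simpler alternative sketch is to walk from every node and stop as soon as a node repeats. It reuses the same id -> send_to_id mapping idea and only reports True/False rather than the cycle itself:
def has_cycle(data: list) -> bool:
    # successor map: id -> send_to_id, ignoring items that point nowhere
    nxt = {d["id"]: d["send_to_id"] for d in data if d["send_to_id"] is not None}
    for start in nxt:
        seen = set()
        node = start
        while node in nxt:          # follow the chain until it ends...
            if node in seen:        # ...or revisits a node: circular path
                return True
            seen.add(node)
            node = nxt[node]
    return False

mydict = [
    {"id": 0, "item_total": 10000, "send_to_id": None},
    {"id": 1, "item_total": 15000, "send_to_id": 3},
    {"id": 2, "item_total": 30000, "send_to_id": 1},
    {"id": 3, "item_total": 20000, "send_to_id": 2},
]
print(has_cycle(mydict))  # True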

JSON data ordering with pandas, Python

I have JSON like this:
json = {
    "b": 22,
    "x": 12,
    "a": 2,
    "c": 4
}
When I generate an Excel file from this JSON like this:
import pandas as pd
df = pd.read_json(json_text)
file_name = 'test.xls'
file_path = "/tmp/" + file_name
df.to_excel(file_path, index=False)
print("path to excel " + file_path)
Pandas does its own ordering in the Excel file like this:
pandas_json = {
    "a": 2,
    "b": 22,
    "c": 4,
    "x": 12
}
I don't want this. I need the ordering that exists in the JSON. Please give me some advice on how to do this.
UPDATE:
If I have JSON like this:
json = [
    {"b": 22, "x": 12, "a": 2, "c": 4},
    {"b": 22, "x": 12, "a": 2, "c": 2},
    {"b": 22, "x": 12, "a": 4, "c": 4},
]
pandas will generate its own ordering like this:
pandas_json = [
    {"a": 2, "b": 22, "c": 4, "x": 12},
    {"a": 2, "b": 22, "c": 2, "x": 12},
    {"a": 4, "b": 22, "c": 4, "x": 12},
]
How can I make pandas preserve my own ordering?
You can read the JSON as an OrderedDict, which will help retain the original order:
import json
from collections import OrderedDict
import pandas as pd

json_ = """{
    "b": 22,
    "x": 12,
    "a": 2,
    "c": 4
}"""

data = json.loads(json_, object_pairs_hook=OrderedDict)
pd.DataFrame.from_dict(data, orient='index')
    0
b  22
x  12
a   2
c   4
Edit: the updated JSON also works:
j = """[{"b": 22, "x": 12, "a": 2, "c": 4},
        {"b": 22, "x": 12, "a": 2, "c": 2},
        {"b": 22, "x": 12, "a": 4, "c": 4}]"""
data = json.loads(j, object_pairs_hook=OrderedDict)
pd.DataFrame.from_dict(data).to_json(orient='records')
'[{"b":22,"x":12,"a":2,"c":4},{"b":22,"x":12,"a":2,"c":2},{"b":22,"x":12,"a":4,"c":4}]'
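For the original Excel use case the same idea carries over: build the DataFrame from the order-preserving structure and only then write it out. On Python 3.7+, plain dicts returned by json.loads already preserve key order, so object_pairs_hook becomes optional there. A small sketch (assuming an xlsx writer such as openpyxl is installed; the file path is just an example):
import json
import pandas as pd

json_text = """[{"b": 22, "x": 12, "a": 2, "c": 4},
                {"b": 22, "x": 12, "a": 2, "c": 2},
                {"b": 22, "x": 12, "a": 4, "c": 4}]"""

# json.loads keeps the key order on Python 3.7+ (use object_pairs_hook=OrderedDict
# on older versions), and the explicit columns list pins the column order
records = json.loads(json_text)
df = pd.DataFrame(records, columns=list(records[0].keys()))

df.to_excel("/tmp/test.xlsx", index=False)  # columns come out as b, x, a, c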

Mongoengine aggregation return empty cursor

If I execute an aggregation query with an expression that matches documents:
>>> devices = [1, 2, 3, 4, 5, 6]  # device IDs
>>> c = View.objects.aggregate(
        {"$match": {"d": {"$in": devices}, "f": {"$ne": 1}}},
        {"$group": {'_id': "uniqueDocs", 'count': {"$sum": 1}}}
    )
I get this result:
>>> list(c)
[{u'count': 2874791, u'_id': u'uniqueDocs'}]
But if I execute a query with an expression that matches nothing:
>>> now = datetime.utcnow().replace(tzinfo=tz.gettz('UTC'))
>>> current_hour_start = now.replace(minute=0, second=0, microsecond=0)
>>> c = View.objects.aggregate(
        {"$match": {"d": {"$in": devices}, "b": {"$gte": current_hour_start}, "f": {"$ne": 1}}},
        {"$group": {'_id': "uniqueDocs", 'count': {"$sum": 1}}})
I get an empty cursor:
>>> list(c)
[]
How can I get a zero count instead, like this:
>>> list(c)
[{u'count': 0, u'_id': u'uniqueDocs'}]
Update:
Example dataset and expected result.
>>> View.objects()
{
    _id: ObjectId("578f79b877824688fc0d68ed") }, {
    $set: {
        "d": 1,     /* device ID */
        "i": 1899,
        "s": 1,
        "a": 1,
        "m": 0,
        "f": 0,
        "b": ISODate("2016-07-20T08:35:56.066Z"),  /* begin time */
        "e": ISODate("2016-07-20T08:35:57.965Z")   /* end time */
    }
},
{
    _id: ObjectId("578f79b877824688fc0d68ee") }, {
    $set: {
        "d": 2,
        "i": 2456,
        "s": 1,
        "a": 1,
        "m": 0,
        "f": 0,
        "b": ISODate("2016-07-20T08:37:26.066Z"),
        "e": ISODate("2016-07-20T08:37:28.965Z")
    }
},
{
    _id: ObjectId("578f79b877824688fc0d68ef") }, {
    $set: {
        "d": 1000,  /* !!! ignore this document (no matched device ID) */
        "i": 2567,
        "s": 1,
        "a": 1,
        "m": 0,
        "f": 0,
        "b": ISODate("2016-07-20T08:35:56.066Z"),
        "e": ISODate("2016-07-20T08:35:57.965Z")
    }
}
>>> c = View.objects.aggregate(
        {"$match": {"d": {"$in": devices}, "f": {"$ne": 1}}},
        {"$group": {'_id': "uniqueDocs", 'count': {"$sum": 1}}}
    ).next()['count']
2
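One way to get the zero count is to handle the empty cursor on the Python side: $group emits no output document when $match filters everything out, so a default can be returned when the cursor is exhausted. This is only a hedged sketch, assuming View, devices and current_hour_start as defined above:
pipeline = [
    {"$match": {"d": {"$in": devices},
                "b": {"$gte": current_hour_start},
                "f": {"$ne": 1}}},
    {"$group": {"_id": "uniqueDocs", "count": {"$sum": 1}}},
]

# next(cursor, default) returns the default instead of raising StopIteration
# when the aggregation produced no groups
result = next(View.objects.aggregate(*pipeline), {"_id": "uniqueDocs", "count": 0})
print(result["count"])  # 0 when nothing matched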

How to traverse and build a multidimensional dictionary without using recursion

I have a JSON message whose highest level is a dictionary of unknown depth and structure, and I am looking to traverse it and format it, ending up with a new, formatted dictionary. After using timeit, I found it to be very slow and discovered that recursion in Python is not very quick at all. With all that understood, I don't know how to transform my recursive function Foo.format_it into a loop-based one, if that is even possible.
import time
import json


class Foo(object):
    def __init__(self):
        self.msg_out = {}
        self.msg_in = None
        self.sample_data = """
        {
            "data": {
                "a": "",
                "b": "",
                "c": "127.0.0.1",
                "d": 80,
                "e": {"f": false, "g": false, "h": false, "i": false, "j": false, "k": false},
                "l": [ {"ii": 2, "hh": 10, "gg": 200, "aa": -1, "bb": -1, "ff": -1, "cc": -1, "dd": 3, "ee": 0},
                       {"ii": 5, "hh": 20, "gg": 300, "aa": -1, "bb": -1, "ff": -1, "cc": -1, "dd": -1, "ee": -1},
                       {"ii": 5, "hh": 30, "gg": -400, "aa": -1, "bb": -1, "ff": -1, "cc": -1, "dd": -1, "ee": -1}],
                "m": true,
                "n": true,
                "o": 1000,
                "p": 2000,
                "q": "",
                "r": 5,
                "s": 0,
                "t": true,
                "u": true,
                "v": {"jj": 5, "kk": 0, "ll": 10, "mm": 9, "nn": [ { "aa": 20, "bb": 30 }, { "aa": 20, "bb": 30 } ] }
            }
        }
        """

    def format(self, msg_in):
        print msg_in
        self.msg_in = json.loads(msg_in)
        self.msg_out = {}
        self.format_it(self.msg_in, self.msg_out)
        import pprint
        print pprint.pformat(self.msg_out)
        return json.dumps(self.msg_out)

    def ff(self, val, out_struct):
        if int(val) < 0:
            out_struct[u'ff'] = ""
        else:
            out_struct[u'ff'] = str(val)

    def format_it(self, item, out_struct):
        if isinstance(item, dict):
            for dict_key, dict_val in item.iteritems():
                if dict_key in dir(self):
                    dict_key = getattr(self, dict_key)(dict_val, out_struct)
                if dict_key:
                    if isinstance(dict_val, dict):
                        out_struct[dict_key] = {}
                        self.format_it(dict_val, out_struct[dict_key])
                    elif isinstance(dict_val, list):
                        out_struct[dict_key] = []
                        self.format_it(dict_val, out_struct[dict_key])
                    else:
                        out_struct[dict_key] = dict_val
        elif isinstance(item, list):
            for list_val in item:
                if isinstance(list_val, dict):
                    out_struct.append({})
                    self.format_it(list_val, out_struct[-1])
                elif isinstance(list_val, list):
                    out_struct.append([])
                    self.format_it(list_val, out_struct[-1])
                else:
                    out_struct.append(list_val)
        else:
            pass


if __name__ == "__main__":
    tic = time.clock()
    f = Foo()
    f.format(f.sample_data)
    print (time.clock() - tic)
Here are the input and output data, as requested. In the simplest case, only the key 'ff' needed to be formatted, so -1 became an empty string:
[IN]
{
    "data": {
        "a": "",
        "b": "",
        "c": "127.0.0.1",
        "d": 80,
        "e": {"f": false, "g": false, "h": false, "i": false, "j": false, "k": false},
        "l": [ {"ii": 2, "hh": 10, "gg": 200, "aa": -1, "bb": -1, "ff": -1, "cc": -1, "dd": 3, "ee": 0},
               {"ii": 5, "hh": 20, "gg": 300, "aa": -1, "bb": -1, "ff": -1, "cc": -1, "dd": -1, "ee": -1},
               {"ii": 5, "hh": 30, "gg": -400, "aa": -1, "bb": -1, "ff": -1, "cc": -1, "dd": -1, "ee": -1}],
        "m": true,
        "n": true,
        "o": 1000,
        "p": 2000,
        "q": "",
        "r": 5,
        "s": 0,
        "t": true,
        "u": true,
        "v": {"jj": 5, "kk": 0, "ll": 10, "mm": 9, "nn": [ { "aa": 20, "bb": 30 }, { "aa": 20, "bb": 30 } ] }
    }
}
[OUT]
{u'data': {u'a': u'',
           u'b': u'',
           u'c': u'127.0.0.1',
           u'd': 80,
           u'e': {u'f': False,
                  u'g': False,
                  u'h': False,
                  u'i': False,
                  u'j': False,
                  u'k': False},
           u'l': [{u'aa': -1,
                   u'bb': -1,
                   u'cc': -1,
                   u'dd': 3,
                   u'ee': 0,
                   u'ff': '',
                   u'gg': 200,
                   u'hh': 10,
                   u'ii': 2},
                  {u'aa': -1,
                   u'bb': -1,
                   u'cc': -1,
                   u'dd': -1,
                   u'ee': -1,
                   u'ff': '',
                   u'gg': 300,
                   u'hh': 20,
                   u'ii': 5},
                  {u'aa': -1,
                   u'bb': -1,
                   u'cc': -1,
                   u'dd': -1,
                   u'ee': -1,
                   u'ff': '',
                   u'gg': -400,
                   u'hh': 30,
                   u'ii': 5}],
           u'm': True,
           u'n': True,
           u'o': 1000,
           u'p': 2000,
           u'q': u'',
           u'r': 5,
           u's': 0,
           u't': True,
           u'u': True,
           u'v': {u'jj': 5,
                  u'kk': 0,
                  u'll': 10,
                  u'mm': 9,
                  u'nn': [{u'aa': 20, u'bb': 30}, {u'aa': 20, u'bb': 30}]}}}
The code is a bit pared down and uses tic/toc instead of timeit. Using both, the execution of just the recursion seems to be around 0.0012 s (even with the object creation and json load removed from the time calculation).
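Purely as an illustration of the loop-based approach being asked about, here is a minimal Python 3 sketch that replaces the recursion with an explicit stack of (input node, output container) pairs. The formatters mapping is a simplified, hypothetical stand-in for the getattr-based per-key hooks (such as ff) in Foo, so treat it as a sketch of the technique rather than a drop-in replacement:
def format_iterative(msg_in, formatters=None):
    """Rebuild a nested dict/list structure without recursion.

    formatters maps a key name to a function value -> formatted value;
    keys with a formatter get their value replaced, everything else is copied.
    """
    formatters = formatters or {}
    msg_out = {}
    stack = [(msg_in, msg_out)]  # (input node, output container) pairs still to process
    while stack:
        item, out = stack.pop()
        if isinstance(item, dict):
            for key, val in item.items():
                if key in formatters:
                    out[key] = formatters[key](val)
                elif isinstance(val, dict):
                    out[key] = {}
                    stack.append((val, out[key]))
                elif isinstance(val, list):
                    out[key] = []
                    stack.append((val, out[key]))
                else:
                    out[key] = val
        else:  # a list
            for val in item:
                if isinstance(val, (dict, list)):
                    child = {} if isinstance(val, dict) else []
                    out.append(child)
                    stack.append((val, child))
                else:
                    out.append(val)
    return msg_out


# usage: replicate the 'ff' rule, where -1 becomes an empty string
data = {"l": [{"ff": -1, "gg": 200}, {"ff": 3, "gg": 300}]}
print(format_iterative(data, {"ff": lambda v: "" if int(v) < 0 else str(v)}))
# {'l': [{'ff': '', 'gg': 200}, {'ff': '3', 'gg': 300}]}
Because every output container is created and linked into its parent before it is pushed onto the stack, the order in which the stack is drained does not affect the final structure.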
