Cerberus coercion within nested list - python

I get unexpected behaviour for the following code:
import cerberus
v = cerberus.Validator()
schema = {'list_of_values': {'type': 'list',
'schema': {'items': [{'type': 'string', 'coerce': str},
{'type': 'integer', 'coerce': int}]}}
}
document = {'list_of_values': [['hello', 100], [123, "122"]]}
v.validate(document, schema)
v.errors
I am expecting to have no errors, as the coercion should take care of the types. But I am getting
{'list_of_values': [{1: [{0: ['must be of string type'],
1: ['must be of integer type']}]}]}
Is this a bug? Am I misunderstanding how the coercion works?

#funky-future
Nothing wrong on your end — I can indeed reproduce the problem just by copy-pasting the example into the prompt:
>>> import cerberus
>>> v = cerberus.Validator()
>>> schema = {'list_of_values': {'type': 'list',
... 'schema': {'items': [{'type': 'string', 'coerce': str},
... {'type': 'integer', 'coerce': int}]}}
... }
>>> document = {'list_of_values': [['hello', 100], [123, "122"]]}
>>> v.validate(document, schema)
False
>>> v.errors
{'list_of_values': [{1: [{0: ['must be of string type'], 1: ['must be of integer type']}]}]}
Python3.5.2, cerberus1.2

Related

RDD to DF conversion

I am new to Pyspark. My code looks something like below. I am not sure why df.collect() is showing None values for all the string values.
>> rdd = sc.parallelize([{'name': 'test', 'age': {"id": 326, "first_name": "Will", "last_name": "Cur"}},
{'name': 'test2', 'age': {"id": 751, "first_name": "Will", "last_name": "Mc"}}])
>> rdd.collect()
[{'name': 'test', 'age': {'id': 326, 'first_name': 'Will', 'last_name': 'Cur'}}, {'name': 'test2', 'age': {'id': 751, 'first_name': 'Will', 'last_name': 'Mc'}}]
>> df = spark.createDataFrame(rdd)
>> df.collect()
[Row(age={'last_name': None, 'first_name': None, 'id': 326}, name='test'), Row(age={'last_name': None, 'first_name': None, 'id': 751}, name='test2')]
For complex data structures, Spark might have difficulty in inferring the schema from the RDD, so you can instead provide a schema to make sure that the conversion is done properly:
df = spark.createDataFrame(
rdd,
'name string, age struct<id:int, first_name:string, last_name:string>'
)
df.collect()
# [Row(name='test', age=Row(id=326, first_name='Will', last_name='Cur')),
# Row(name='test2', age=Row(id=751, first_name='Will', last_name='Mc'))]

cerberus - how to validate arbitrary dict keys?

I have read issues here and here using keysrules and valuesrules but I've only seen them validate nested keys, not the root. I'd like to validate the top-level root dict keys.
schema = {
'any_arbitrary_str': {
'type': 'dict',
'keysrules': {'type': 'string'},
'valuesrules': {'type': 'integer'},
},
}
v = Validator(schema)
v.validate({'test': {'a': 1, 'b': 2}})
print(v.errors)
In this example, I'd like to just validate that schema is dict of str: Dict[str, int] where the keys can be any arbitrary string.
I'm not sure I'm using it right per the docs; this fails with cerberus.schema.SchemaError: {'any_arbitrary_str': [{'keysrules': ['unknown rule'], 'valuesrules': ['unknown rule']}]}, but it's also still looking for the literal key any_arbitrary_str instead of accepting any string.
You can just nest it. Not pretty, but works. I have not found a more elegant solution yet.
schema = {
'document': {
'type': 'dict',
'keysrules': {'type': 'string'},
'valuesrules': {
'type': 'dict',
'keysrules': {'type': 'string'},
'valuesrules': {'type': 'integer'},
},
},
}
v = Validator(schema)
document_to_test = {'test': {'a': 1, 'b': 2}}
v.validate({'document': document_to_test})
print(v.errors)

How to convert nested Dictionary to JSON string

I try to convert a nested dictionary to JSON string
a = {'default': {'version': 1.0, 'db': 'mangodb', 'uuid': 'eaada7dc-ec30-4548-a080-c4f70293202a'}, 'temperatures': [{1: 50}, {2: 100}]}
a_json = json.dumps(a['temperature'])
print(a_json)
I was expecting to have
{1: 50}, {2: 100}, but when I executed this code, I had [[null, {"1": 50}], {"2": 100}]]
How can I get a result without this null?
Something like the following should work:
import json
a = {'default': {'version': 1.0, 'db': 'mangodb', 'uuid': 'eaada7dc-ec30-4548-a080-c4f70293202a'}, 'temperatures': [{1: 50}, {2: 100}]}
with open('out.json', mode='w+') as f:
a_json = json.dump(a['temperatures'], f)
Also, in case you don't want to use an output file:
import json
a = {'default': {'version': 1.0, 'db': 'mangodb', 'uuid': 'eaada7dc-ec30-4548-a080-c4f70293202a'}, 'temperatures': [{1: 50}, {2: 100}]}
a_json = json.dumps(a['temperatures'])
print(a_json)
I have tested both samples and they appear to be working just fine.

Cerberus: Use "required" fields with custom validator

I have validation rules in Cerberus that require a custom validator. When accessing fields in self.document, I have to also validate those fields are present, even if using the "required" flag. I am looking for a way for the "required" flag to handle this for me.
For example, say I have a dictionary named data with arrays a and b and the stipulations that both a and b are required and that len(a) == len(b).
# Schema
schema = {'data':
{'type': 'dict',
'schema': {'a': {'type': 'list',
'required': True,
'length_b': True},
'b': {'type': 'list',
'required': True}}}}
# Validator
class myValidator(cerberus.Validator):
def _validate_length_b(self, length_b, field, value):
"""Validates a field has the same length has b"""
if length_b:
b = self.document.get('b')
if not len(b) == len(value):
self._error(field, 'is not equal to length of b array')
This works fine if a and b are present:
good = {'data': {'a': [1, 2, 3],
'b': [1, 2, 3]}}
v = myValidator()
v.validate(good, schema)
# True
bad = {'data': {'a': [1, 2, 3],
'b': [1, 3]}}
v.validate(bad, schema)
# False
v.errors
# {'data': [{'a': ['is not equal to length of b array']}]}
However, if b is missing, it returns a TypeError from len().
very_bad = {'data': {'a': [1, 2, 3]}}
v.validate(very_bad, schema)
# TypeError: object of type 'NoneType' has no len()
How can I get validate to return False instead (as b is not present)? My desired output is below:
v.validate(very_bad, schema)
# False
v.errors
# {'data': [{'b': ['required field']}]}
Taking Validating that two params have same amount elements using Cerberus as inspiration, you could do:
schema = {'data':
{'type': 'dict',
'schema': {'a': {'type': 'list',
'required': True,
'match_length': 'b'},
'b': {'type': 'list',
'required': True}}}}
class MyValidator(cerberus.Validator):
def _validate_match_length(self, other, field, value):
if other not in self.document:
return False
elif len(value) != len(self.document[other]):
self._error(field,
"Length doesn't match field %s's length." % other)
Then:
v = MyValidator(schema)
good = {'data': {'a': [1, 2, 3],
'b': [1, 2, 3]}}
v.validate(good)
-> True
bad = {'data': {'a': [1, 2, 3],
'b': [1, 3]}}
v.validate(bad)
-> False
v.errors
-> {'data': [{'a': ["Length doesn't match field b's length."]}]}
very_bad = {'data': {'a': [1, 2, 3]}}
v.validate(very_bad)
-> False
v.errors
-> {'data': [{'b': ['required field']}]}

cerberus: Validate an optional field occurs at least once

I'm using cerberus to validate data. One of my fields is optional - it doesn't need to be present for every item. However, the key must be populated at least once across the entire data array.
As an example, say I want to validate the key 'c' occurs in at least one dictionary in my data list:
from cerberus import Validator
has_c = {'data': [{'a': 1, 'b': 2}, {'b': 2}, {'c': 3}]}
no_c = {'data': [{'a': 1, 'b': 2}, {'a': 1}]}
schema = {'data':
{'type': 'list',
'schema': {
'type': 'dict',
'schema': {
'a': {'required': True},
'b': {'required': True},
'c': {'required': False, 'at_least_one': True}
}
}
}
}
class MyValidator(Validator):  # Some fancy code...
....
v = MyValidator()
v.validate(has_c, schema) # Passes
v.validate(no_c, schema) # Fails
This seems doable outside of cerberus, but I'd like to keep the method in my validator if possible.
If you want the method to be in the Validator subclass, then you will want to create a custom rule just like you were thinking.
from cerberus import Validator
test_with_c = {'data': [{'a': 1, 'b': 2}, {'b': 2}, {'c': 3}]}
test_with_no_c = {'data': [{'a': 1, 'b': 2}, {'a': 1}]}
class MyValidator(Validator):
def _validate_has_c(self, has_c, field, value):
seen_c = False
for v in value:
if "c" in v:
seen_c = True
if has_c and not seen_c:
self._error(field, "Must contain a 'c' key")
schema = {
"data": {
"type": "list",
"has_c": True
}
}
v = MyValidator(schema)
print(v(test_with_c), v.errors)
print(v(test_with_no_c), v.errors)
Running this yields the results you want with respect to looking for a c key in one of the elements:
True {}
False {'data': ["Must contain a 'c' key"]}

Categories

Resources