I am using Marshmallow to create a mapper for a JSON file. Following are the details:
My JSON File:
{
    "version": "3.0",
    "name": "A1",
    "request": {
        "startdate": "26022022",
        "enddate": "26022022",
        "records": 1000
    },
    "ranking": {
        "90": {
            "name": "N1",
            "class1": "C1"
        },
        "98": {
            "name": "N2",
            "class1": "C2"
        },
        "86": {
            "name": "N3",
            "class1": "C3"
        }
    }
}
My mapper class:
class RequestMapper(Schema):
    startdate = fields.String()
    enddate = fields.String()
    records = fields.Int()

class Ranking(Schema):
    name = fields.String()
    class1 = fields.String()

class RankingMapper(Schema):
    rank = fields.Nested(Ranking(), dataKey = fields.Int)

class SampleSchema(Schema):
    name = fields.Str()
    request = fields.Nested(RequestMapper())
    ranking = fields.Nested(RankingMapper())
Code to call Mapper:
print("\n\nOutput using mapper")
pprint(mapper.SampleSchema().dump(data), indent=3)
print("\n\n")
Following is the output:
Output using mapper
{ 'name': 'A1',
'ranking': {},
'request': {'enddate': '26022022', 'records': 1000, 'startdate': '26022022'}}
I am not getting any data for ranking, because the keys [90, 98, 86...] are dynamic and I am not sure how to create a mapper for dynamic keys.
Any inputs will be helpful.
Thank you
When nesting schemas, pass the class NAME, not a class instance:
class RankingMapper(Schema):
    rank = fields.Nested(Ranking, dataKey = fields.Int)

class SampleSchema(Schema):
    name = fields.Str()
    request = fields.Nested(RequestMapper)
    ranking = fields.Nested(RankingMapper)
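That alone will not populate ranking, though, because the keys (90, 98, 86, ...) are dynamic and never match a fixed field name. One way marshmallow 3 models dynamic keys is fields.Dict; the following is a minimal sketch under that assumption, not tested against your exact payload:

class SampleSchema(Schema):
    name = fields.Str()
    request = fields.Nested(RequestMapper)
    # the JSON keys ("90", "98", "86", ...) are strings mapping to Ranking objects
    ranking = fields.Dict(keys=fields.String(), values=fields.Nested(Ranking))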
Related
I have 4 tables: Hardware, SoftwareName, SoftwareVersion, and Software.
The Software table has a one-to-many relationship with the SoftwareName and SoftwareVersion tables. Finally, the Hardware model has a one-to-many relationship with the Software table.
I'm trying to get just a specific column from a model relationship using Pydantic Schema.
Now I'm getting this output:
[
    {
        "id": 1,
        "hostname": "hostname2",
        "softwares": [
            {
                "id": 1,
                "software_name": {
                    "id": 1,
                    "name": "nginx"
                },
                "software_version": {
                    "id": 1,
                    "version": "2.9"
                }
            },
            {
                "id": 2,
                "software_name": {
                    "id": 2,
                    "name": "vim"
                },
                "software_version": {
                    "id": 2,
                    "version": "0.3"
                }
            },
            {
                "id": 3,
                "software_name": {
                    "id": 3,
                    "name": "apache"
                },
                "software_version": {
                    "id": 3,
                    "version": "1.0"
                }
            }
        ]
    }
]
But what I expect is this output:
[
    {
        "id": 1,
        "hostname": "hostname2",
        "softwares": [
            {
                "id": 1,
                "name": "nginx",
                "version": "2.9"
            },
            {
                "id": 2,
                "name": "vim",
                "version": "0.3"
            },
            {
                "id": 3,
                "name": "apache",
                "version": "1.0"
            }
        ]
    }
]
I have the file main.py:
import uvicorn
from typing import Any, Iterator, List, Optional
from faker import Faker
from fastapi import Depends, FastAPI
from pydantic import BaseModel
from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, sessionmaker, relationship
from faker.providers import DynamicProvider

software_name = DynamicProvider(
    provider_name="software_name",
    elements=["bash", "vim", "vscode", "nginx", "apache"],
)

software_version = DynamicProvider(
    provider_name="software_version",
    elements=["1.0", "2.9", "1.1", "0.3", "2.0"],
)

hardware = DynamicProvider(
    provider_name="hardware",
    elements=["hostname1", "hostname2", "hostname3", "hostname4", "hostname5"],
)

fake = Faker()

# then add new provider to faker instance
fake.add_provider(software_name)
fake.add_provider(software_version)
fake.add_provider(hardware)

engine = create_engine("sqlite:///.db", connect_args={"check_same_thread": False})
SessionLocal = sessionmaker(autocommit=True, autoflush=True, bind=engine)
Base = declarative_base(bind=engine)

class Software(Base):
    __tablename__ = 'software'
    id = Column(Integer, primary_key=True)
    hardware_id = Column(Integer, ForeignKey('hardware.id'))
    name_id = Column(Integer, ForeignKey('software_name.id'))
    version_id = Column(Integer, ForeignKey('software_version.id'))
    software_name = relationship('SoftwareName', backref='software_name')
    software_version = relationship('SoftwareVersion', backref='software_version')

class SoftwareName(Base):
    __tablename__ = 'software_name'
    id = Column(Integer, primary_key=True)
    name = Column(String)

class SoftwareVersion(Base):
    __tablename__ = 'software_version'
    id = Column(Integer, primary_key=True)
    version = Column(String)

class Hardware(Base):
    __tablename__ = "hardware"
    id = Column(Integer, primary_key=True, autoincrement=True)
    hostname = Column(String, nullable=False)
    softwares = relationship(Software)

Base.metadata.drop_all()
Base.metadata.create_all()

class BaseSchema(BaseModel):
    id: int

    class Config:
        orm_mode = True

class SoftwareNameSchema(BaseSchema):
    name: str

class SoftwareVersionSchema(BaseSchema):
    version: str

class SoftwareSchema(BaseSchema):
    software_name: SoftwareNameSchema
    software_version: SoftwareVersionSchema

class HardwareOut(BaseSchema):
    hostname: str
    softwares: List[SoftwareSchema]

app = FastAPI()

@app.on_event("startup")
def on_startup() -> None:
    session = SessionLocal()
    for _ in range(10):
        software_list = []
        for _ in range(3):
            sn = SoftwareName(name=fake.software_name())
            sv = SoftwareVersion(version=fake.software_version())
            s = Software(software_name=sn, software_version=sv)
            software_list.append(s)
        h = Hardware(hostname=fake.hardware(), softwares=software_list)
        session.add(h)
        session.flush()
    session.close()

def get_db() -> Iterator[Session]:
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()

@app.get("/hardwares", response_model=List[HardwareOut])
def get_hardwares(db: Session = Depends(get_db)) -> Any:
    return [HardwareOut.from_orm(hardware) for hardware in db.query(Hardware).all()]
How can I change the HardwareOut Schema to return what I expect?
I finally got the answer I wanted.
I made two changes to get it:
1. Use the Union type from the typing lib for the attributes software_name and software_version.
2. Add a Pydantic validator for each field to change the returned value.
Like this:
from typing import Union
from pydantic import validator

...

class SoftwareSchema(BaseSchema):
    software_name: Union[str, SoftwareNameSchema]
    software_version: Union[str, SoftwareVersionSchema]

    @validator('software_name')
    def name_to_str(cls, v, values, **kwargs):
        return v.name if not isinstance(v, str) else v

    @validator('software_version')
    def version_to_str(cls, v, values, **kwargs):
        return v.version if not isinstance(v, str) else v
...
And the answer was this:
[
    {
        "id": 1,
        "hostname": "hostname2",
        "softwares": [
            {
                "id": 1,
                "software_name": "nginx",
                "software_version": "2.9"
            },
            {
                "id": 2,
                "software_name": "vim",
                "software_version": "0.3"
            },
            {
                "id": 3,
                "software_name": "apache",
                "software_version": "1.0"
            }
        ]
    }
]
Update:
As an improvement, I added an alias to each attribute for a more semantic response, changing software_name to name and software_version to version. Like this:
from typing import Union
from pydantic import Field, validator

...

class SoftwareSchema(BaseSchema):
    software_name: Union[str, SoftwareNameSchema] = Field(None, alias="name")
    software_version: Union[str, SoftwareVersionSchema] = Field(None, alias="version")

    @validator('software_name')
    def name_to_str(cls, v, values, **kwargs):
        return v.name if not isinstance(v, str) else v

    @validator('software_version')
    def version_to_str(cls, v, values, **kwargs):
        return v.version if not isinstance(v, str) else v

...
I am working on a boto script that will create an IAM Policy and store its attributes in a DynamoDB table. I have a Python function which pulls attributes like region, instance_type, ebs_volume_size, meta_template_name, start_time, and end_time from another file. While writing the code for CloudTrail and DynamoDB I am getting an error on table creation saying:
"An error occurred (ValidationException) when calling the CreateTable operation: Invalid KeySchema: Some index key attribute have no definition".
This is my code; I am not sure what is wrong.
import jmespath
import boto3
import sys
import json
import time
import meta_templates
from jinja2 import Template

iam = boto3.client('iam')
sts = boto3.client('sts')
ec2 = boto3.resource('ec2')
cloudtrail = boto3.client('cloudtrail')
s3 = boto3.client('s3')
sqs = boto3.client('sqs')
lambd = boto3.client('lambda')
dynamodb = boto3.resource('dynamodb')

###########################
##### Global variables ####
###########################
region = "us-east-2"
instance_type = "t2.micro"
ebs_volume_size = "20"
meta_template_name = "ec2_policy_meta_template"
###############################

start_time_1 = input("What's the start time")
end_time1 = input("What's the end time")

def create_aws_iam_policy_template(**kwargs):
    template_data = {}
    template_data["region"] = kwargs.get('region')
    template_data["start_time"] = kwargs.get('start_time')
    template_data["end_time"] = kwargs.get('end_time')
    template_data["instance_types"] = kwargs.get('instance_type')
    template_data["ebs_volume_size"] = kwargs.get('ebs_volume_size')
    template_data["meta_template_name"] = kwargs.get('meta_template_name')
    meta_template_dict = getattr(meta_templates, template_data["meta_template_name"])
    meta_template_json = json.dumps(meta_template_dict)
    template_json = Template(meta_template_json).render(template_data)
    return template_json

template_json = create_aws_iam_policy_template(
    region=region,
    instance_type=instance_type,
    ebs_volume_size=ebs_volume_size,
    meta_template_name=meta_template_name,
    start_time=start_time_1,
    end_time=end_time1
)
print(template_json)

# Create S3 Bucket for CloudTrail
# Create a bucket policy
bucket_name = 'goodbucket3'
bucket_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "cloudtrail.amazonaws.com"},
            "Action": "s3:GetBucketAcl",
            "Resource": f"arn:aws:s3:::{bucket_name}"
        },
        {
            "Effect": "Allow",
            "Principal": {"Service": "cloudtrail.amazonaws.com"},
            "Action": "s3:PutObject",
            "Resource": f"arn:aws:s3:::{bucket_name}/AWSLogs/562922379100/*",
            "Condition": {"StringEquals": {"s3:x-amz-acl": "bucket-owner-full-control"}}
        }
    ]
}

# Convert the policy from JSON dict to string
bucket_policy = json.dumps(bucket_policy)

# Set the new policy
s3.put_bucket_policy(Bucket='goodbucket3', Policy=bucket_policy)
result = s3.get_bucket_policy(Bucket='goodbucket3')

logs = cloudtrail.create_trail(
    Name='GoodTrail',
    S3BucketName='goodbucket3',
)

response = cloudtrail.start_logging(
    Name=f"arn:aws:cloudtrail:us-east-1:562922379100:trail/GoodTrail"
)

table = dynamodb.create_table(
    TableName='GoodTable',
    KeySchema=[
        {
            'AttributeName': 'Content',
            'KeyType': 'HASH'  # Partition key
        },
        {
            'AttributeName': 'Details',
            'KeyType': 'HASH'  # Sort key
        }
    ],
    AttributeDefinitions=[
        {
            "AttributeName": "Content",
            "AttributeType": "S"
        }
    ],
    ProvisionedThroughput={
        "ReadCapacityUnits": 1,
        "WriteCapacityUnits": 1
    }
)

time.sleep(20)

table = dynamodb.Table('GoodTable')
response = table.put_item(
    Item={
        'Content': 'Volume Size',
        'Details': f'{ebs_volume_size}',
    }
)
response = table.put_item(
    Item={
        'Content': 'Instance Type',
        'Details': f'{instance_type}',
    }
)
response = table.put_item(
    Item={
        'Content': 'Region',
        'Details': f'{region}',
    }
)
From the documentation for PutItem:
When you add an item, the primary key attributes are the only required attributes. Attribute values cannot be null.
You declared your Schema with S3BucketName as the Partition key.
KeySchema=[
    {
        "AttributeName": "S3BucketName",
        "KeyType": "HASH"
    }
],
So when you add an item you have to provide that too.
For example:
KeySchema=[
    {
        'AttributeName': 'year',
        'KeyType': 'HASH'  # Partition key
    },
    {
        'AttributeName': 'title',
        'KeyType': 'RANGE'  # Sort key
    }
]

response = table.put_item(
    Item={
        'year': year,
        'title': title,
        'info': {
            'plot': plot,
            'rating': rating
        }
    }
)
Getting Started Developing with Python and DynamoDB
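Applied to the table in the question, the CreateTable error itself comes from listing 'Details' in KeySchema without a matching entry in AttributeDefinitions, and from declaring two HASH keys. A hedged sketch of a corrected call, keeping the question's names:

table = dynamodb.create_table(
    TableName='GoodTable',
    KeySchema=[
        {'AttributeName': 'Content', 'KeyType': 'HASH'},   # Partition key
        {'AttributeName': 'Details', 'KeyType': 'RANGE'},  # Sort key
    ],
    AttributeDefinitions=[
        # every attribute named in KeySchema needs a definition here
        {'AttributeName': 'Content', 'AttributeType': 'S'},
        {'AttributeName': 'Details', 'AttributeType': 'S'},
    ],
    ProvisionedThroughput={
        'ReadCapacityUnits': 1,
        'WriteCapacityUnits': 1
    }
)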
The data looks like this (a flat dataframe with BusinessName, SourceSystem, FunctionalName, DataType, and DataExtractName columns):
The expected JSON format is like this:
{
    "DataExtractName": "SalesDataExtract",
    "BusinessName": {
        "InvoiceDate": {
            "SourceSystem": {
                "MYSQL": "Invc_Dt",
                "CSV": "Invc_Date"
            },
            "DataType": {
                "MYSQL": "varchar",
                "CSV": "string"
            }
        },
        "Description": {
            "SourceSystem": {
                "MYSQL": "Prod_Desc",
                "CSV": "Prod_Descr"
            },
            "DataType": {
                "MYSQL": "varchar",
                "CSV": "string"
            }
        }
    }
},
{
    "DataExtractName": "DateDataExtract",
    "BusinessName": {
        "InvoiceDate": {
            "SourceSystem": {
                "MYSQL": "Date"
            },
            "DataType": {
                "MYSQL": "varchar"
            }
        }
    }
}
How do I achieve this using Python dataframes? Or do I need to write some script to shape the data like this?
Note: I've tried using df.to_json and df.to_dict.
With so many nested structures, you should use marshmallow. It is built with your use case in mind. Please check out the excellent documentation: https://marshmallow.readthedocs.io/en/stable/ . All you need is the basic usage.
It is a lot of code, but it is better to be explicit than clever. I am sure a shorter solution exists, but it would probably be unmaintainable. Also, I had to build your dataframe; please provide it in a data format next time.
import pandas as pd
import marshmallow as ma

# build test data
df = pd.DataFrame.from_records([
    ['InvoiceDate', 'MYSQL', 'Invc_Dt', 'varchar', 'SalesDataExtract'],
    ['InvoiceDate', 'CSV', 'Invc_Date', 'string', 'SalesDataExtract'],
    ['Description', 'MYSQL', 'Prod_Descr', 'varchar', 'SalesDataExtract'],
    ['Description', 'CSV', 'Prod_Descr', 'string', 'SalesDataExtract'],
    ['InvoiceDate', 'MYSQL', 'Date', 'varchar', 'DateDataExtract'],
])
df.columns = ['BusinessName', 'SourceSystem', 'FunctionalName', 'DataType', 'DataExtractName']

# define marshmallow schemas
class SourceSystemTypeSchema(ma.Schema):
    MYSQL = ma.fields.String()
    CSV = ma.fields.String()

class DataTypeSchema(ma.Schema):
    MYSQL = ma.fields.String()
    CSV = ma.fields.String()

class InvoiceDateSchema(ma.Schema):
    InvoiceDate = ma.fields.Nested(SourceSystemTypeSchema())
    DataType = ma.fields.Nested(DataTypeSchema())

class DescriptionSchema(ma.Schema):
    SourceSystem = ma.fields.Nested(SourceSystemTypeSchema())
    DataType = ma.fields.Nested(DataTypeSchema())

class BusinessNameSchema(ma.Schema):
    InvoiceDate = ma.fields.Nested(InvoiceDateSchema())
    Description = ma.fields.Nested(DescriptionSchema())

class DataSchema(ma.Schema):
    DataExtractName = ma.fields.String()
    BusinessName = ma.fields.Nested(BusinessNameSchema())

# building json
result = []
mask_business_name_invoicedate = df.BusinessName == 'InvoiceDate'
mask_business_name_description = df.BusinessName == 'Description'
for data_extract_name in set(df['DataExtractName'].to_list()):
    mask_data_extract_name = df.DataExtractName == data_extract_name
    # you need these two helper dfs to get the dictionaries
    df_source_system = df[mask_data_extract_name & mask_business_name_invoicedate].set_index('SourceSystem').to_dict(orient='dict')
    df_description = df[mask_data_extract_name & mask_business_name_description].set_index('SourceSystem').to_dict(orient='dict')
    # all dictionaries are defined, so you can use your schemas
    source_system_type = SourceSystemTypeSchema().dump(df_source_system['FunctionalName'])
    data_type = DataTypeSchema().dump(df_source_system['DataType'])
    source_system = SourceSystemTypeSchema().dump(df_description['FunctionalName'])
    invoice_date = InvoiceDateSchema().dump({'SourceSystemType': source_system_type, 'DataType': data_type})
    description = DescriptionSchema().dump({'SourceSystem': source_system, 'DataType': data_type})
    business_name = BusinessNameSchema().dump({'InvoiceDate': invoice_date, 'Description': description})
    data = DataSchema().dump({'DataExtractName': data_extract_name, 'BusinessName': business_name})
    # end result
    result.append(data)
Now,
ma.pprint(result)
returns
[{'BusinessName': {'Description': {'DataType': {'CSV': 'string',
'MYSQL': 'varchar'},
'SourceSystem': {'CSV': 'Prod_Descr',
'MYSQL': 'Prod_Descr'}},
'InvoiceDate': {'DataType': {'CSV': 'string',
'MYSQL': 'varchar'}}},
'DataExtractName': 'SalesDataExtract'},
{'BusinessName': {'Description': {'DataType': {'MYSQL': 'varchar'},
'SourceSystem': {}},
'InvoiceDate': {'DataType': {'MYSQL': 'varchar'}}},
'DataExtractName': 'DateDataExtract'}]
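As an aside, a shorter schema-free variant with plain pandas groupby is possible; this is my own sketch against the df built above, trading the explicit schemas for brevity:

result = []
for extract_name, g in df.groupby('DataExtractName'):
    business = {}
    for business_name, bg in g.groupby('BusinessName'):
        # one dict per BusinessName, keyed by SourceSystem
        business[business_name] = {
            'SourceSystem': dict(zip(bg['SourceSystem'], bg['FunctionalName'])),
            'DataType': dict(zip(bg['SourceSystem'], bg['DataType'])),
        }
    result.append({'DataExtractName': extract_name, 'BusinessName': business})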
I am using DRF to update data. It works well, but I am struggling with how to update foreign keys.
{
    "name": "Ready Or Not",
    "releases": [
        {
            "platform": {
                "name": "test"
            },
            "date": "2019-10-02T11:38:18Z"
        }
    ]
},
This is a response from my API, but I want to update the 'releases' information as well.
To send the update I have this. If the 'platform' name doesn't exist, it should also be created. How do I do this?
headers = {
    'Authorization': 'Token ' + ss_token,
}
data = {
    "releases": [
        {
            "platform": {
                "name": "wanttoupdate"
            },
            "date": "2019-10-02T11:38:18Z"
        },
    ]
}

source = Source.objects.all().first()
url = source.url + str(947) + '/'
response = requests.patch(url, headers=headers, data=data)
My models:
class Game(models.Model):
    name = models.CharField(max_length=255)

class Platform(models.Model):
    name = models.CharField(max_length=255)

class Release(models.Model):
    platform = models.ForeignKey(Platform, on_delete=models.CASCADE)
    game = models.ForeignKey(Game, related_name='releases', on_delete=models.CASCADE)
    date = models.DateTimeField()
To post a ForeignKey instance, you have to use its id. For example, in your case:
if request.method == 'POST':
    platform = request.POST['platform']  # pass your html name parameter
    release_obj = Release(platform_id=platform)
    release_obj.save()
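Alternatively, since the request body nests platform objects rather than ids, a writable nested serializer can implement the get-or-create behavior the question asks for. A hedged sketch (the serializer names below are assumptions; the question does not show its serializers):

from rest_framework import serializers

class PlatformSerializer(serializers.ModelSerializer):
    class Meta:
        model = Platform
        fields = ['name']

class ReleaseSerializer(serializers.ModelSerializer):
    platform = PlatformSerializer()

    class Meta:
        model = Release
        fields = ['platform', 'date']

class GameSerializer(serializers.ModelSerializer):
    releases = ReleaseSerializer(many=True)

    class Meta:
        model = Game
        fields = ['name', 'releases']

    def update(self, instance, validated_data):
        releases_data = validated_data.pop('releases', [])
        instance.name = validated_data.get('name', instance.name)
        instance.save()
        for release_data in releases_data:
            platform_data = release_data.pop('platform')
            # create the platform if one with that name does not exist yet
            platform, _ = Platform.objects.get_or_create(name=platform_data['name'])
            Release.objects.create(game=instance, platform=platform, **release_data)
        return instance

Note also that requests.patch(url, headers=headers, data=data) form-encodes the payload, which flattens the nested dicts; sending json=data preserves the structure for DRF.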
I am trying to serialize an object with Marshmallow in such a way that "related" properties are grouped together into a single dictionary that does not exist on the original object. My code:
from marshmallow import Schema, fields, pprint
import json

class StatsSchema(Schema):
    population = fields.Int()
    rating = fields.Int()

class AnimalSchema(Schema):
    name = fields.Str()
    features = fields.List(fields.Str())
    stats = fields.Nested(StatsSchema)

dog = {
    'name': 'dog',
    'features': ['tongue', 'wet nose'],
    'population': 200,
    'rating': 10
}

animal_schema = AnimalSchema()
data, errors = animal_schema.dump(dog)
print(json.dumps(data, indent=2))
Actual result:
{
  "features": [
    "tongue",
    "wet nose"
  ],
  "name": "dog"
}
Desired result:
{
  "features": [
    "tongue",
    "wet nose"
  ],
  "name": "dog",
  "stats": {"population": 200, "rating": 10}
}
I understand that the "stats" key is missing from the output because it is not on the original object, but I am not sure how to specify that Marshmallow should create the new "stats" key as a new dictionary using the object.
I found one possible way to create the inner dictionary. Not sure if it is the only/best method:
class AnimalSchema(Schema):
    name = fields.Str()
    features = fields.List(fields.Str())
    stats = fields.Method('get_stats')

    def get_stats(self, post):
        data, err = StatsSchema().dump(post)
        return data
This is discussed in https://github.com/marshmallow-code/marshmallow/issues/940.
You could do this:
class AnimalSchema(Schema):
    name = fields.Str()
    features = fields.List(fields.Str())
    stats = fields.Nested(StatsSchema, dump_only=True)

class Animal:
    [...]

    @property
    def stats(self):
        return {'population': self.population, 'rating': self.rating}
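A minimal usage sketch of this property approach; the Animal constructor below is an assumption filling in the elided [...] part:

class Animal:
    def __init__(self, name, features, population, rating):
        self.name = name
        self.features = features
        self.population = population
        self.rating = rating

    @property
    def stats(self):
        # groups the flat attributes into the dict the nested schema expects
        return {'population': self.population, 'rating': self.rating}

dog = Animal('dog', ['tongue', 'wet nose'], 200, 10)
print(AnimalSchema().dump(dog))
# marshmallow 3 returns the dict directly; marshmallow 2 wraps it in a
# (data, errors) result:
# {'name': 'dog', 'features': ['tongue', 'wet nose'],
#  'stats': {'population': 200, 'rating': 10}}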