I have a new project where I obtain JSON data back from a REST API - I'm trying to parse this data to csv pipe delimited to import to our legacy software
I can't seem to get all the value pairs parsed properly - this is my first exposure to JSON and I've tried so many things but only getting a little right at a time
I have used Python and can get some items that I need but not the whole JSON tree - it comes across as a list and has some dictionaries and lists in it as well
I know my code is incomplete and just looking for someone to point me in the right direction on what tools in python can get the job done
import json
import csv
# Load the tenant export; the top level of the file is a JSON array with one
# object per tenant.
with open('tenants.json') as access_json:
    read_content = json.load(access_json)

for rm_data in read_content:
    print(rm_data)
    # Not every tenant carries every nested section, so fall back to an empty
    # list instead of raising KeyError (also covers an explicit JSON null).
    contacts_data = rm_data.get('Contacts') or []
    leases_data = rm_data.get('Leases') or []
    udfs_data = rm_data.get('UserDefinedValues') or []
    for rm_contacts in contacts_data:
        # Placeholder: each rm_contacts is one contact dict of this tenant.
        pass
UPDATED:
import pandas as pd
# Load the tenant export; the top level of the file is a JSON array with one
# object per tenant.
with open('tenants.json') as access_json:
    read_content = json.load(access_json)

pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 150)

# Top-level tenant fields that become DataFrame columns, in output order.
tenant_columns = [
    "TenantID",
    "TenantDisplayID",
    "Name",
    "FirstName",
    "LastName",
    "WebMessage",
    "Comment",
    "RentDueDay",
    "RentPeriod",
    "PropertyID",
    "PostingStartDate",
    "CreateDate",
    "CreateUserID",
    "UpdateUserID",
    "Contacts",
]

# dict.get() yields None for fields a tenant does not have, so every row ends
# up with the same set of columns instead of the loop dying on a KeyError.
rows = [
    {column: rm_data.get(column) for column in tenant_columns}
    for rm_data in read_content
]

# Passing columns= pins the column order and keeps the header even when the
# input list is empty.
df = pd.DataFrame(rows, columns=tenant_columns)
print(df)
Here is sample of the file
[
{
"TenantID": 115,
"TenantDisplayID": 115,
"Name": "Jane Doe",
"FirstName": "Jane",
"LastName": "Doe",
"WebMessage": "",
"Comment": "",
"RentDueDay": 1,
"RentPeriod": "Monthly",
"FirstContact": "2015-11-01T15:30:00",
"PropertyID": 17,
"PostingStartDate": "2010-10-01T00:00:00",
"CreateDate": "2014-04-16T13:35:37",
"CreateUserID": 1,
"UpdateDate": "2017-03-22T11:31:48",
"UpdateUserID": 1,
"Contacts": [
{
"ContactID": 128,
"FirstName": "Jane",
"LastName": "Doe",
"MiddleName": "",
"IsPrimary": true,
"DateOfBirth": "1975-02-27T00:00:00",
"FederalTaxID": "111-11-1111",
"Comment": "",
"Email": "jane.doe#mail.com",
"License": "ZZT4532",
"Vehicle": "BMW 3 Series",
"IsShowOnBill": true,
"Employer": "REW",
"ApplicantType": "Applicant",
"CreateDate": "2014-04-16T13:35:37",
"CreateUserID": 1,
"UpdateDate": "2017-03-22T11:31:48",
"AnnualIncome": 0.0,
"UpdateUserID": 1,
"ParentID": 115,
"ParentType": "Tenant",
"PhoneNumbers": [
{
"PhoneNumberID": 286,
"PhoneNumberTypeID": 2,
"PhoneNumber": "703-555-5610",
"Extension": "",
"StrippedPhoneNumber": "7035555610",
"IsPrimary": true,
"ParentID": 128,
"ParentType": "Contact"
}
]
}
],
"UserDefinedValues": [
{
"UserDefinedValueID": 1,
"UserDefinedFieldID": 4,
"ParentID": 115,
"Name": "Emerg Contact Name",
"Value": "Terry Harper",
"UpdateDate": "2016-01-22T15:41:53",
"FieldType": "Text",
"UpdateUserID": 2,
"CreateUserID": 2
},
{
"UserDefinedValueID": 174,
"UserDefinedFieldID": 5,
"ParentID": 115,
"Name": "Emerg Contact Phone",
"Value": "703-555-3568",
"UpdateDate": "2016-01-22T15:42:03",
"FieldType": "Text",
"UpdateUserID": 2,
"CreateUserID": 2
}
],
"Leases": [
{
"LeaseID": 115,
"TenantID": 115,
"UnitID": 181,
"PropertyID": 17,
"MoveInDate": "2010-10-01T00:00:00",
"SortOrder": 1,
"CreateDate": "2014-04-16T13:35:37",
"UpdateDate": "2017-03-22T11:31:48",
"CreateUserID": 1,
"UpdateUserID": 1
}
],
"Addresses": [
{
"AddressID": 286,
"AddressTypeID": 1,
"Address": "14393 Montgomery Road Lot #102\r\nCincinnati, OH 45122",
"Street": "14393 Montgomery Road Lot #102",
"City": "Cincinnati",
"State": "OH",
"PostalCode": "45122",
"IsPrimary": true,
"ParentID": 115,
"ParentType": "Tenant"
}
],
"OpenReceivables": [],
"Status": "Current"
},
Not all tenants will have all elements which is also tricky
I need the data from the top where there is TenantID, TenantDisplayID, etc
I also need the data from the Contacts, PhoneNumbers, Leases, etc values
Each line should be static so if it doesn't have certain tags then I'd like a Null or None so it would look like
TenantID|TenantDisplayID|FirstName… etc., so each line has the same number of fields
Something like this should work:
import pandas as pd
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100000)

# Top-level tenant fields that become DataFrame columns, in output order.
tenant_columns = [
    "TenantID",
    "TenantDisplayID",
    "Name",
    "FirstName",
    "LastName",
    "WebMessage",
    "Comment",
    "RentDueDay",
    "RentPeriod",
    "FirstContact",
    "PropertyID",
    "PostingStartDate",
    "CreateDate",
    "CreateUserID",
    "UpdateUserID",
    "Contacts",
]

# NOTE(review): read_content is not defined in this snippet; it is assumed to
# be the list of tenant dicts already loaded with json.load() - confirm.
rows = []
for rm_data in read_content:
    print(rm_data)
    # dict.get() yields None for fields a tenant does not have, so every row
    # has the same columns instead of the loop dying on a KeyError.
    rows.append({column: rm_data.get(column) for column in tenant_columns})

# Passing columns= pins the column order and keeps the header even when the
# input list is empty.
df = pd.DataFrame(rows, columns=tenant_columns)
print(df)
The General Problem
The problem with this task (and other similar ones) is not just how to create an algorithm - I am sure you will theoretically be able to solve this with a (not so) nice amount of nested for-loops. The problem is to organise the code in a way that you don't get a headache - i.e. in a way that you can fix bugs easily, that you can write unittests, that you can understand the code easily from reading it (in six months from now) and that you can easily change your code in case you need to do so.
I do not know anybody who does not make mistakes when wrapping their head around a deeply nested structure. And chasing for bugs in a code which is heavily nested because it mirrors the nested structure of the data, can be quite frustrating.
The Quick (and most probably: Best) Solution
Rely on packages that are made for your exact usecase, such as
https://github.com/cwacek/python-jsonschema-objects
In case you have a formal definition of the API schema, you could use packages for that. If, for instance, your API has a Swagger schema definition, you can use swagger-py (https://github.com/digium/swagger-py) to get your JSON response into Python objects.
The Principle Solution: Object Oriented Programming and Recursion
Even if there might be some libraries for your concrete use case, I would like to explain the principle of how to deal with "that kind" of tasks:
A good way to organise code for this kind of problem is using Object Oriented Programming. The nesting hassle can be laid out much more clearly by making use of the principle of recursion. This also makes it easier to change the code in case the JSON schema of your API response changes for any reason (an update of the API, for instance). In your case I would suggest you create something like the following:
class JsonObject:
    """Parent class for any object that is retrieved from the JSON and
    potentially has nested JsonObjects inside.

    This class takes care of parsing the JSON into Python objects and deals
    with the recursion into the nested structures. Subclasses describe their
    own shape by overriding ``primitives``, ``json_objects`` and
    ``json_object_lists``.
    """

    # Names of the plain (non-nested) values that are copied onto the instance.
    primitives = []

    json_objects = {
        # For each class, this dict defines all the "embedded" classes which
        # live directly "under" that class in the nested JSON. It has the
        # following structure:
        #     attribute_name : class
        # In the sample JSON there are no "single" embedded objects in the
        # nesting structure, only lists of nested objects. The dict is still
        # included to demonstrate how single embedded objects would be
        # handled if there were any.
    }

    json_object_lists = {
        # For each class, this dict defines all the "embedded" subclasses which
        # are provided in a list "under" that class in the nested JSON.
        # It has the following structure:
        #     attribute_name : class
    }

    @classmethod
    def from_dict(cls, d: dict) -> "JsonObject":
        """Build an instance of ``cls`` from the decoded JSON dict ``d``.

        Args:
            d: The dict for one JSON object of this class.

        Returns:
            A new instance with one attribute per entry in ``primitives``,
            ``json_object_lists`` and ``json_objects``. Keys missing from
            ``d`` become ``None`` (primitives and single nested objects) or
            an empty list (nested object lists).
        """
        instance = cls()

        for attribute in cls.primitives:
            # d is a dict, so the value has to be read by key (getattr on a
            # dict never finds it) and stored under the attribute's own name.
            setattr(instance, attribute, d.get(attribute))

        for attribute, klass in cls.json_object_lists.items():
            # "or []" covers both a missing key and an explicit JSON null.
            nested_dicts = d.get(attribute) or []
            setattr(
                instance,
                attribute,
                [klass.from_dict(nested_dict) for nested_dict in nested_dicts],
            )

        for attribute, klass in cls.json_objects.items():
            nested_dict = d.get(attribute)
            # Do not recurse into an absent object; keep None as the marker.
            setattr(
                instance,
                attribute,
                None if nested_dict is None else klass.from_dict(nested_dict),
            )

        return instance

    def to_csv(self) -> str:
        """Placeholder for CSV serialisation.

        Intentionally left unimplemented because the desired CSV layout is
        not specified; it currently returns None.
        """
        pass
Since you didn't explain how exactly you want to create a csv from the JSON, I didn't implement that method and left this to you. It is also not necessary to explain the overall approach.
Now we have the general parent class that all our specific classes will inherit from, so that we can apply recursion to our problem. Now we only need to define these concrete structures according to the JSON schema we want to parse. I got the following from your sample, but you can easily change the things you need to:
class Address(JsonObject):
    """Schema of one address record (an entry of a tenant's "Addresses")."""

    # An address has no nested objects below it.
    json_objects = {}
    json_object_lists = {}

    # Plain values copied from the JSON record.
    primitives = [
        "AddressID", "AddressTypeID",
        "Address", "Street", "City", "State", "PostalCode",
        "IsPrimary",
        "ParentID", "ParentType",
    ]
class Lease(JsonObject):
    """Schema of one lease record (an entry of a tenant's "Leases")."""

    # A lease has no nested objects below it.
    json_objects = {}
    json_object_lists = {}

    # Plain values copied from the JSON record.
    primitives = [
        "LeaseID", "TenantID", "UnitID", "PropertyID",
        "MoveInDate", "SortOrder",
        "CreateDate", "UpdateDate",
        "CreateUserID", "UpdateUserID",
    ]
class UserDefinedValue(JsonObject):
    """Schema of one entry of a tenant's "UserDefinedValues" list."""

    # A user-defined value has no nested objects below it.
    json_objects = {}
    json_object_lists = {}

    # Plain values copied from the JSON record.
    primitives = [
        "UserDefinedValueID", "UserDefinedFieldID", "ParentID",
        "Name", "Value",
        "UpdateDate", "FieldType",
        "UpdateUserID", "CreateUserID",
    ]
class PhoneNumber(JsonObject):
    """Schema of one entry of a contact's "PhoneNumbers" list."""

    # A phone number has no nested objects below it.
    json_objects = {}
    json_object_lists = {}

    # Plain values copied from the JSON record.
    primitives = [
        "PhoneNumberID", "PhoneNumberTypeID",
        "PhoneNumber", "Extension", "StrippedPhoneNumber",
        "IsPrimary",
        "ParentID", "ParentType",
    ]
class Contact(JsonObject):
    """Schema of one contact record (an entry of a tenant's "Contacts")."""

    # Plain values copied from the JSON record.
    primitives = [
        "ContactID", "FirstName", "LastName", "MiddleName",
        "IsPrimary", "DateOfBirth", "FederalTaxID",
        "Comment", "Email", "License", "Vehicle",
        "IsShowOnBill", "Employer", "ApplicantType",
        "CreateDate", "CreateUserID",
        "UpdateDate", "AnnualIncome", "UpdateUserID",
        "ParentID", "ParentType",
    ]

    # No single embedded objects.
    json_objects = {}

    # Each contact carries a list of phone numbers.
    json_object_lists = {"PhoneNumbers": PhoneNumber}
class Tenant(JsonObject):
    """Schema of one top-level tenant record of the JSON response."""

    # Plain values copied from the JSON record.
    primitives = [
        "TenantID", "TenantDisplayID",
        "Name", "FirstName", "LastName",
        "WebMessage", "Comment",
        "RentDueDay", "RentPeriod", "FirstContact",
        "PropertyID", "PostingStartDate",
        "CreateDate", "CreateUserID",
        "UpdateDate", "UpdateUserID",
        "OpenReceivables",  # Maybe this is also a nested Object? Not clear from your sample.
        "Status",
    ]

    # Lists of embedded records, keyed by their JSON attribute name.
    json_object_lists = {
        "Contacts": Contact,
        "UserDefinedValues": UserDefinedValue,
        "Leases": Lease,
        "Addresses": Address,
    }

    # No single embedded objects.
    json_objects = {}
You might imagine the "beauty" (at least: order) of that approach, which lies in the following: With this structure, we could tackle any level of nesting in the JSON response of your API without additional headache - our code would not deepen its indentation level, because we have separated the nasty nesting into the recursive definition of JsonObject's from_dict method. That is why it is much easier now to identify bugs or apply changes to our code.
To finally parse the JSON now into our Objects, you would do something like the following:
import typing
import json
def tenants_from_json(json_string: str) -> typing.Iterable["Tenant"]:
    """Decode a JSON array of tenant records into a list of Tenant objects.

    Args:
        json_string: JSON text whose top level is a list of tenant dicts.

    Returns:
        A list with one ``Tenant`` per record, in input order.
    """
    parsed = []
    for record in json.loads(json_string):
        parsed.append(Tenant.from_dict(record))
    return parsed
Important Final Side Note: This is just the basic Principle
My code example is just a very brief introduction to the idea of using objects and recursion to deal with an overwhelming (and nasty) nesting of a structure. The code has some flaws. For instance, one should avoid defining mutable class variables. And of course the whole code should validate the data it gets from the API. You also might want to add the type of each attribute and represent that correctly in the Python objects (your sample has integers, datetimes and strings, for instance).
I really only wanted to show you the very principle of Object Oriented Programming here.
I didn't take the time to test my code. So there are probably bugs left. Again, I just wanted to demonstrate the principle.
I'm trying to figure out what would be the best way to create classes in a dynamic manner based on the contents of a JSON file. So for example, here's a snippet from the JSON file:
{
"stuff": [{
"name": "burger",
"aka": ["cheeseburger", "hamburger"]
},
{
"name": "fries",
"aka": ["french fries", "potatoes"]
},
{
"name": "meal",
"items": [{
"name": "burger",
"value": "<burger>"
},
{
"name": "fries",
"value": "<fries>"
}
]
}
]
}
And now based on this JSON, I want classes that represent these objects. So for example, something like:
class Burger:
    """Hand-written example of the class the "burger" JSON entry should map to."""

    def __init__(self):
        # Values mirror the "name" and "aka" fields of the JSON entry.
        self.name, self.aka = "burger", ["cheeseburger", "hamburger"]
class Meal:
    """Hand-written example of the class the "meal" JSON entry should map to."""

    def __init__(self):
        self.name = "meal"
        # One attribute per item of the meal, each holding an instance of the
        # item's class. NOTE(review): Fries is not defined in the visible
        # snippet - presumably analogous to Burger; confirm.
        self.burger = Burger()
        self.fries = Fries()
So basically, based on that JSON, I want to be able to create classes that represent the same attributes and relationships that we see in the JSON. Any ideas about the best way to approach this would be appreciated!
Assuming json variable contains your json data try this:
# NOTE(review): `json` is assumed to be an iterable of dicts that each have a
# "name" key (e.g. the list under "stuff"), not the stdlib json module - confirm.
classes = {}  # every generated class, keyed by its name
for d in json:
    name = d['name']
    # Build the class namespace from a copy so the input data is not mutated.
    attrs = {key: val for key, val in d.items() if key != 'name'}
    # type(name, bases, namespace) creates a new class, as a class statement would.
    t = type(name, (object,), attrs)
    # Keep each class; t alone only ever holds the most recently created one.
    classes[name] = t
What it does is call type, which creates a new type in Python (exactly the same as if you had written a class statement), with the class name set to the content of the name variable, with base class object, and with the attributes in d. The variable t will contain the class object you want.
I'm having trouble dynamically creating a Python dictionary path to loop through and validate a value. Here's what I'd like to do:
Make API call using Requests 1.0 and store the JSON response in a dict.
response = requests.get(path/to/file.json).json()
The response object will be formatted as follows:
{
"status": "OK",
"items": [
{
"name": "Name 1",
"id": 0,
"address":{
"city": "New York",
}
},
{
"name": "Name 2",
"id": 1,
"address":{
"city": "New York",
}
},
{
"name": "Name 3",
"id": 2,
"address":{
"city": "New York",
}
}]
}
Send the response dict, field and value to a function for validation. The function would take the response object and append the field entry to it to define its path then validate against the value. So in theory it would be:
response[field] = value
The code that I wrote to do this was:
def dynamic_assertion(response, field, value):
    """Check that every entry of ``response['items']`` holds ``value`` at ``field``.

    Args:
        response: Decoded JSON response; must contain an ``'items'`` list.
        field: Path below each item written as bracketed subscripts, e.g.
            ``"['address']['city']"``. Quoted segments are used as dict keys,
            bare integers as list indices.
        value: The value expected at that path in every item.

    Returns:
        True if all items match (also when there are no items), otherwise
        False.

    Raises:
        ValueError: If ``field`` is not a sequence of bracketed subscripts.
        KeyError, IndexError, TypeError: If the path does not exist in an item.
    """
    import re  # local import: the surrounding file does not import re

    # One subscript: ['key'], ["key"] or [index].
    token = r"\[\s*(?:'([^']*)'|\"([^\"]*)\"|(-?\d+))\s*\]"
    if re.fullmatch("(?:%s)*" % token, field.strip()) is None:
        raise ValueError("unsupported field path: %r" % (field,))

    # Turn the path string into real keys instead of building a string that
    # merely looks like an expression (a string never equals the value).
    keys = [
        int(number) if number else (single or double)
        for single, double, number in re.findall(token, field)
    ]

    for item in response['items']:
        target = item
        for key in keys:
            target = target[key]  # go one level deeper
        if target != value:
            return False
    return True
# Example call: field is passed as a bracketed path string and "New York" is
# the value being validated; the return value is discarded here.
dynamic_assertion(response, "['address']['city']", "New York")
I realize that once I create the path string it is no longer an object. How do I create this in a way that will allow me to keep the response object and append the reference path to traverse through? Is this even possible?!
I think you'd be better off avoiding a single path string in favor of a tuple or list of strings which represent the individual keys in the nested dictionaries. That is, rather than "['address']['city']" being your field argument, you'd pass ("address", "city"). Then you just need a loop to go through the keys and see if the final value is the correct one:
def dynamic_assertion(response, field, value):
    """Tell whether every entry of ``response["items"]`` holds ``value`` at ``field``.

    Args:
        response: Mapping with an ``"items"`` iterable of nested containers.
        field: Iterable of keys, followed in order from each item downwards,
            e.g. ``("address", "city")``.
        value: The value expected at the end of that key path.

    Returns:
        False as soon as one item differs from ``value``; True otherwise
        (including when there are no items).
    """
    def resolve(entry):
        # Walk down the nested containers one key at a time.
        node = entry
        for key in field:
            node = node[key]
        return node

    # any() stops at the first mismatch, like an early "return False".
    return not any(resolve(entry) != value for entry in response["items"])
Example output (given the response dict from the question):
>>> dynamic_assertion(response, ("address", "city"), "New York")
True
>>> dynamic_assertion(response, ("address", "city"), "Boston")
False
>>> response["items"][2]["address"]["city"] = "Boston" # make response invalid
>>> dynamic_assertion(response, ("address", "city"), "New York")
False
>>> dynamic_assertion(response, ("address", "city"), "Boston")
False