How can I parse nested JSON to CSV - python

I have a new project where I get JSON data back from a REST API. I'm trying to parse this data to pipe-delimited CSV so I can import it into our legacy software.
I can't seem to get all the key/value pairs parsed properly. This is my first exposure to JSON, and I've tried many things but only get a little right at a time.
I have used Python and can extract some of the items I need, but not the whole JSON tree - it comes across as a list that contains dictionaries and lists as well.
I know my code is incomplete; I'm just looking for someone to point me in the right direction on what tools in Python can get the job done.
import json
import csv

with open('tenants.json') as access_json:
    read_content = json.load(access_json)

for rm_access in read_content:
    rm_data = rm_access
    print(rm_data)
    contacts_data = rm_data['Contacts']
    leases_data = rm_data['Leases']
    udfs_data = rm_data['UserDefinedValues']
    for contacts_access in contacts_data:
        rm_contacts = contacts_access
UPDATED:
import json

import pandas as pd

with open('tenants.json') as access_json:
    read_content = json.load(access_json)

for rm_access in read_content:
    rm_data = rm_access

pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 150)

TenantID = []
TenantDisplayID = []
Name = []
FirstName = []
LastName = []
WebMessage = []
Comment = []
RentDueDay = []
RentPeriod = []
FirstContact = []
PropertyID = []
PostingStartDate = []
CreateDate = []
CreateUserID = []
UpdateDate = []
UpdateUserID = []
Contacts = []

for rm_access in read_content:
    rm_data = rm_access
    TenantID.append(rm_data["TenantID"])
    TenantDisplayID.append(rm_data["TenantDisplayID"])
    Name.append(rm_data["Name"])
    FirstName.append(rm_data["FirstName"])
    LastName.append(rm_data["LastName"])
    WebMessage.append(rm_data["WebMessage"])
    Comment.append(rm_data["Comment"])
    RentDueDay.append(rm_data["RentDueDay"])
    RentPeriod.append(rm_data["RentPeriod"])
    # FirstContact.append(rm_data["FirstContact"])
    PropertyID.append(rm_data["PropertyID"])
    PostingStartDate.append(rm_data["PostingStartDate"])
    CreateDate.append(rm_data["CreateDate"])
    CreateUserID.append(rm_data["CreateUserID"])
    UpdateUserID.append(rm_data["UpdateUserID"])
    Contacts.append(rm_data["Contacts"])

df = pd.DataFrame({"TenantID": TenantID, "TenantDisplayID": TenantDisplayID, "Name": Name,
                   "FirstName": FirstName, "LastName": LastName, "WebMessage": WebMessage,
                   "Comment": Comment, "RentDueDay": RentDueDay, "RentPeriod": RentPeriod,
                   "PropertyID": PropertyID, "PostingStartDate": PostingStartDate, "CreateDate": CreateDate,
                   "CreateUserID": CreateUserID, "UpdateUserID": UpdateUserID, "Contacts": Contacts})
print(df)
Here is a sample of the file:
[
  {
    "TenantID": 115,
    "TenantDisplayID": 115,
    "Name": "Jane Doe",
    "FirstName": "Jane",
    "LastName": "Doe",
    "WebMessage": "",
    "Comment": "",
    "RentDueDay": 1,
    "RentPeriod": "Monthly",
    "FirstContact": "2015-11-01T15:30:00",
    "PropertyID": 17,
    "PostingStartDate": "2010-10-01T00:00:00",
    "CreateDate": "2014-04-16T13:35:37",
    "CreateUserID": 1,
    "UpdateDate": "2017-03-22T11:31:48",
    "UpdateUserID": 1,
    "Contacts": [
      {
        "ContactID": 128,
        "FirstName": "Jane",
        "LastName": "Doe",
        "MiddleName": "",
        "IsPrimary": true,
        "DateOfBirth": "1975-02-27T00:00:00",
        "FederalTaxID": "111-11-1111",
        "Comment": "",
        "Email": "jane.doe@mail.com",
        "License": "ZZT4532",
        "Vehicle": "BMW 3 Series",
        "IsShowOnBill": true,
        "Employer": "REW",
        "ApplicantType": "Applicant",
        "CreateDate": "2014-04-16T13:35:37",
        "CreateUserID": 1,
        "UpdateDate": "2017-03-22T11:31:48",
        "AnnualIncome": 0.0,
        "UpdateUserID": 1,
        "ParentID": 115,
        "ParentType": "Tenant",
        "PhoneNumbers": [
          {
            "PhoneNumberID": 286,
            "PhoneNumberTypeID": 2,
            "PhoneNumber": "703-555-5610",
            "Extension": "",
            "StrippedPhoneNumber": "7035555610",
            "IsPrimary": true,
            "ParentID": 128,
            "ParentType": "Contact"
          }
        ]
      }
    ],
    "UserDefinedValues": [
      {
        "UserDefinedValueID": 1,
        "UserDefinedFieldID": 4,
        "ParentID": 115,
        "Name": "Emerg Contact Name",
        "Value": "Terry Harper",
        "UpdateDate": "2016-01-22T15:41:53",
        "FieldType": "Text",
        "UpdateUserID": 2,
        "CreateUserID": 2
      },
      {
        "UserDefinedValueID": 174,
        "UserDefinedFieldID": 5,
        "ParentID": 115,
        "Name": "Emerg Contact Phone",
        "Value": "703-555-3568",
        "UpdateDate": "2016-01-22T15:42:03",
        "FieldType": "Text",
        "UpdateUserID": 2,
        "CreateUserID": 2
      }
    ],
    "Leases": [
      {
        "LeaseID": 115,
        "TenantID": 115,
        "UnitID": 181,
        "PropertyID": 17,
        "MoveInDate": "2010-10-01T00:00:00",
        "SortOrder": 1,
        "CreateDate": "2014-04-16T13:35:37",
        "UpdateDate": "2017-03-22T11:31:48",
        "CreateUserID": 1,
        "UpdateUserID": 1
      }
    ],
    "Addresses": [
      {
        "AddressID": 286,
        "AddressTypeID": 1,
        "Address": "14393 Montgomery Road Lot #102\r\nCincinnati, OH 45122",
        "Street": "14393 Montgomery Road Lot #102",
        "City": "Cincinnati",
        "State": "OH",
        "PostalCode": "45122",
        "IsPrimary": true,
        "ParentID": 115,
        "ParentType": "Tenant"
      }
    ],
    "OpenReceivables": [],
    "Status": "Current"
  },
Not all tenants will have all elements, which is also tricky.
I need the data from the top level, where there is TenantID, TenantDisplayID, etc.
I also need the data from the Contacts, PhoneNumbers, Leases, etc. values.
Each line should have a fixed layout, so if a tenant doesn't have certain tags I'd like a Null or None placeholder, so it would look like
TenantID|TenantDisplayID|FirstName….etc. and each line has the same number of fields.

Something like this should work:
import json

import pandas as pd

# Load the file the same way as in the question
with open('tenants.json') as access_json:
    read_content = json.load(access_json)

pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100000)

TenantID = []
TenantDisplayID = []
Name = []
FirstName = []
LastName = []
WebMessage = []
Comment = []
RentDueDay = []
RentPeriod = []
FirstContact = []
PropertyID = []
PostingStartDate = []
CreateDate = []
CreateUserID = []
UpdateDate = []
UpdateUserID = []
Contacts = []

for rm_access in read_content:
    rm_data = rm_access
    print(rm_data)
    TenantID.append(rm_data["TenantID"])
    TenantDisplayID.append(rm_data["TenantDisplayID"])
    Name.append(rm_data["Name"])
    FirstName.append(rm_data["FirstName"])
    LastName.append(rm_data["LastName"])
    WebMessage.append(rm_data["WebMessage"])
    Comment.append(rm_data["Comment"])
    RentDueDay.append(rm_data["RentDueDay"])
    RentPeriod.append(rm_data["RentPeriod"])
    FirstContact.append(rm_data["FirstContact"])
    PropertyID.append(rm_data["PropertyID"])
    PostingStartDate.append(rm_data["PostingStartDate"])
    CreateDate.append(rm_data["CreateDate"])
    CreateUserID.append(rm_data["CreateUserID"])
    UpdateUserID.append(rm_data["UpdateUserID"])
    Contacts.append(rm_data["Contacts"])

df = pd.DataFrame({"TenantID": TenantID, "TenantDisplayID": TenantDisplayID, "Name": Name,
                   "FirstName": FirstName, "LastName": LastName, "WebMessage": WebMessage,
                   "Comment": Comment, "RentDueDay": RentDueDay, "RentPeriod": RentPeriod,
                   "FirstContact": FirstContact, "PropertyID": PropertyID, "PostingStartDate": PostingStartDate,
                   "CreateDate": CreateDate, "CreateUserID": CreateUserID, "UpdateUserID": UpdateUserID,
                   "Contacts": Contacts})
print(df)
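If the sticking point is the nested lists, pandas also has json_normalize, which flattens them into columns and repeats the parent fields on each row. A minimal sketch along those lines (the chosen meta columns and the "None" placeholder are just assumptions based on the sample and the pipe-delimited requirement):

import json

import pandas as pd

with open('tenants.json') as access_json:
    tenants = json.load(access_json)

# Not every tenant has every list, so give missing ones an empty default
# before flattening.
for tenant in tenants:
    tenant.setdefault('Contacts', [])

# One row per contact, repeating the tenant-level columns on every row.
# PhoneNumbers stays a list column here; it can be flattened the same way
# (record_path=['Contacts', 'PhoneNumbers']) or in a second pass.
df = pd.json_normalize(
    tenants,
    record_path='Contacts',
    meta=['TenantID', 'TenantDisplayID', 'Name', 'RentDueDay', 'RentPeriod',
          'PropertyID', 'PostingStartDate', 'Status'],
    record_prefix='Contact.',
    errors='ignore',
)

# Pipe-delimited output; missing values are written as "None" so every
# line has the same number of fields.
df.to_csv('tenants.csv', sep='|', index=False, na_rep='None')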

The General Problem
The problem with this task (and other similar ones) is not just how to create an algorithm - I am sure you could theoretically solve this with a (not so) nice pile of nested for-loops. The problem is to organise the code in a way that doesn't give you a headache - i.e. in a way that lets you fix bugs easily, write unit tests, understand the code easily when reading it (six months from now), and change it easily when you need to.
I do not know anybody who does not make mistakes when wrapping their head around a deeply nested structure, and chasing bugs in code which is heavily nested because it mirrors the nested structure of the data can be quite frustrating.
The Quick (and most probably: Best) Solution
Rely on packages that are made for your exact use case, such as
https://github.com/cwacek/python-jsonschema-objects
In case you have a formal definition of the API schema, you can use packages built around it. If, for instance, your API has a Swagger schema definition, you can use swagger-py (https://github.com/digium/swagger-py) to get your JSON response into Python objects.
The Principle Solution: Object Oriented Programming and Recursion
Even if there might be some libraries for your concrete use case, I would like to explain the principle of how to deal with this kind of task:
A good way to organise code for this kind of problem is Object Oriented Programming. The nesting hassle can be laid out much more clearly by making use of recursion. This also makes it easier to change the code in case the JSON schema of your API response changes for any reason (an update of the API, for instance). In your case I would suggest you create something like the following:
class JsonObject:
    """Parent class for any object that will be retrieved from the JSON
    and potentially has nested JsonObjects inside.
    This class takes care of parsing the JSON into Python objects and deals
    with the recursion into the nested structures."""

    primitives = []

    json_objects = {
        # For each class, this dict defines all the "embedded" classes which
        # live directly "under" that class in the nested JSON. It will have the
        # following structure:
        #     attribute_name: class
        # In your case the JSON schema does not have any "single" embedded
        # objects in the nesting structure, only lists of nested objects. I
        # still keep it here to demonstrate how you would handle a single
        # "embedded" object if there were one.
    }

    json_object_lists = {
        # For each class, this dict defines all the "embedded" subclasses which
        # are provided in a list "under" that class in the nested JSON.
        # It will have the following structure:
        #     attribute_name: class
    }

    @classmethod
    def from_dict(cls, d: dict) -> "JsonObject":
        instance = cls()
        for attribute in cls.primitives:
            # Here we just parse all the primitives (missing keys become None)
            setattr(instance, attribute, d.get(attribute, None))
        for attribute, klass in cls.json_object_lists.items():
            # Here we parse all lists of embedded JSON objects
            nested_objects = []
            for nested_dict in d.get(attribute, []):
                nested_objects.append(klass.from_dict(nested_dict))
            setattr(instance, attribute, nested_objects)
        for attribute, klass in cls.json_objects.items():
            # Here we parse all "single" embedded JSON objects
            setattr(
                instance,
                attribute,
                klass.from_dict(d.get(attribute, {})),
            )
        return instance

    def to_csv(self) -> str:
        pass
Since you didn't explain how exactly you want to create a CSV from the JSON, I didn't implement that method and left it to you; it is not necessary for explaining the overall approach anyway.
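To give a rough idea of what it could look like anyway - a minimal, untested sketch, where the flat_values helper, the "None" placeholder and the column order are all assumptions about how you want the CSV to look:

class JsonObject:
    # ... attributes and from_dict as defined above ...

    def flat_values(self) -> list:
        """Collect this object's primitive values followed by the values of
        all nested objects, depth first."""
        values = [getattr(self, attribute, None) for attribute in self.primitives]
        for attribute in self.json_objects:
            nested = getattr(self, attribute, None)
            if nested is not None:
                values.extend(nested.flat_values())
        for attribute in self.json_object_lists:
            for nested in getattr(self, attribute, []):
                values.extend(nested.flat_values())
        return values

    def to_csv(self) -> str:
        # "None" stands in for missing values so every column is present
        return "|".join("None" if value is None else str(value) for value in self.flat_values())

Note that lists of nested objects (several Contacts, for instance) will still make the number of fields vary from tenant to tenant; for a truly fixed set of columns you would have to cap the number of contacts/phone numbers or write one line per contact.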
Now we have the general parent class all our specific classes will inherit from, so that we can apply recursion to our problem. Now we only need to define these concrete structures, according to the JSON schema we want to parse. I got the following from your sample, but you can easily change the things you need to:
class Address(JsonObject):
    primitives = [
        "AddressID",
        "AddressTypeID",
        "Address",
        "Street",
        "City",
        "State",
        "PostalCode",
        "IsPrimary",
        "ParentID",
        "ParentType",
    ]
    json_objects = {}
    json_object_lists = {}


class Lease(JsonObject):
    primitives = [
        "LeaseID",
        "TenantID",
        "UnitID",
        "PropertyID",
        "MoveInDate",
        "SortOrder",
        "CreateDate",
        "UpdateDate",
        "CreateUserID",
        "UpdateUserID",
    ]
    json_objects = {}
    json_object_lists = {}


class UserDefinedValue(JsonObject):
    primitives = [
        "UserDefinedValueID",
        "UserDefinedFieldID",
        "ParentID",
        "Name",
        "Value",
        "UpdateDate",
        "FieldType",
        "UpdateUserID",
        "CreateUserID",
    ]
    json_objects = {}
    json_object_lists = {}


class PhoneNumber(JsonObject):
    primitives = [
        "PhoneNumberID",
        "PhoneNumberTypeID",
        "PhoneNumber",
        "Extension",
        "StrippedPhoneNumber",
        "IsPrimary",
        "ParentID",
        "ParentType",
    ]
    json_objects = {}
    json_object_lists = {}


class Contact(JsonObject):
    primitives = [
        "ContactID",
        "FirstName",
        "LastName",
        "MiddleName",
        "IsPrimary",
        "DateOfBirth",
        "FederalTaxID",
        "Comment",
        "Email",
        "License",
        "Vehicle",
        "IsShowOnBill",
        "Employer",
        "ApplicantType",
        "CreateDate",
        "CreateUserID",
        "UpdateDate",
        "AnnualIncome",
        "UpdateUserID",
        "ParentID",
        "ParentType",
    ]
    json_objects = {}
    json_object_lists = {
        "PhoneNumbers": PhoneNumber,
    }


class Tenant(JsonObject):
    primitives = [
        "TenantID",
        "TenantDisplayID",
        "Name",
        "FirstName",
        "LastName",
        "WebMessage",
        "Comment",
        "RentDueDay",
        "RentPeriod",
        "FirstContact",
        "PropertyID",
        "PostingStartDate",
        "CreateDate",
        "CreateUserID",
        "UpdateDate",
        "UpdateUserID",
        "OpenReceivables",  # Maybe this is also a nested object? Not clear from your sample.
        "Status",
    ]
    json_object_lists = {
        "Contacts": Contact,
        "UserDefinedValues": UserDefinedValue,
        "Leases": Lease,
        "Addresses": Address,
    }
    json_objects = {}
You might imagine the "beauty" (at least: order) of that approach, which lies in the following: with this structure, we can tackle any level of nesting in the JSON response of your API without additional headache - our code does not deepen its indentation level, because we have separated the nasty nesting into the recursive from_dict method of JsonObject. That is why it is now much easier to identify bugs or apply changes to our code.
To finally parse the JSON into our objects, you would do something like the following:
import json
import typing


def tenants_from_json(json_string: str) -> typing.Iterable["Tenant"]:
    tenants = [
        Tenant.from_dict(tenant_dict)
        for tenant_dict in json.loads(json_string)
    ]
    return tenants
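A short usage sketch to tie this back to the pipe-delimited file (assuming to_csv returns one line per tenant, however you end up implementing it):

with open('tenants.json') as access_json:
    tenants = tenants_from_json(access_json.read())

with open('tenants.csv', 'w') as out_file:
    for tenant in tenants:
        out_file.write(tenant.to_csv() + '\n')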
Important Final Side Note: This is just the basic Principle
My code example is just a very brief introduction to the idea of using objects and recursion to deal with an overwhelming (and nasty) nesting of a structure. The code has some flaws. For instance, one should avoid defining mutable class variables. And of course the whole code should validate the data it gets from the API. You also might want to add the type of each attribute and represent it correctly in the Python objects (your sample has integers, datetimes and strings, for instance).
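To illustrate that last point, one way to add types is a small dataclass per object with explicit conversion of the date strings. A sketch for the lease records only (the TypedLease name, the field selection and the ISO date format are assumptions taken from the sample):

from dataclasses import dataclass
from datetime import datetime
from typing import Optional


def parse_date(value: Optional[str]) -> Optional[datetime]:
    # The sample uses ISO 8601 strings such as "2010-10-01T00:00:00"
    return datetime.fromisoformat(value) if value else None


@dataclass
class TypedLease:
    LeaseID: Optional[int]
    TenantID: Optional[int]
    UnitID: Optional[int]
    PropertyID: Optional[int]
    MoveInDate: Optional[datetime]
    SortOrder: Optional[int]

    @classmethod
    def from_dict(cls, d: dict) -> "TypedLease":
        return cls(
            LeaseID=d.get("LeaseID"),
            TenantID=d.get("TenantID"),
            UnitID=d.get("UnitID"),
            PropertyID=d.get("PropertyID"),
            MoveInDate=parse_date(d.get("MoveInDate")),
            SortOrder=d.get("SortOrder"),
        )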
I really only wanted to show you the very principle of Object Oriented Programming here.
I didn't take the time to test my code. So there are probably bugs left. Again, I just wanted to demonstrate the principle.

Related

Efficiently setting and deleting array items with Redis JSON

I'm using Redis OM for Python and my models look like below:
from typing import List

from pydantic import BaseModel
from redis_om import EmbeddedJsonModel, Field, JsonModel, Migrator


class FeedItem(EmbeddedJsonModel):
    id: str = Field(index=True)
    s_score: str = Field()
    i_score: str = Field()
    factors: List[str]


class Feed(JsonModel):
    user_id: str = Field(index=True, primary_key=True)
    feed_items: List[FeedItem] = Field(default=[])
which will then result in a data structure like this:
{
  "user_id": "john_001",
  "feed_items": [
    {
      "pk": "01GS2N47G8WK2831GNHMGRVDJT",
      "id": "63e8c53825e41aca93229eac",
      "s_score": "0.5082375478927202",
      "i_score": "0.04626620037029417",
      "factors": ["2nd"],
    },
    {
      "pk": "01GS2N557FTV0SCK5TP2KENVAF",
      "id": "63e8c5d31e033af45abfb64d",
      "s_score": "0.7",
      "i_score": "0.37718576424604",
      "factors": ["2nd", "computer", "laptop"],
    },
    {
      "pk": "01GS2N63S6VM1HZ6RJVH6M1XQJ",
      "id": "63e8c743414c482153e332e6",
      "s_score": "0.5082375478927202",
      "i_score": "0.24141123225673727",
      "factors": ["2nd", "thumbdrive", "portables"],
    },
  ],
}
This is a user's feed, and once he has viewed the first item (the one with "pk": "01GS2N47G8WK2831GNHMGRVDJT"), we need to delete that item from his feed.
Currently, what I have to do is find the key with "user_id": "john_001", retrieve the feed_items into a Python list, remove the item with that pk, then reassign feed_items and save the model, like the following:
feed = Feed.find(Feed.user_id == "john_001").first()
feed_items = feed.feed_items
new_feed_items = [i for i in feed_items if i.pk != "01GS2N47G8WK2831GNHMGRVDJT"]
feed.feed_items = new_feed_items
feed.save()
Is there any better way to do this? Right now the process takes quite long to complete (we have tens of thousands of users' feeds, and there are several deletion processes like this every second).
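One possibility worth testing, hedged because it depends on the server supporting JSONPath filter expressions and it bypasses the Redis OM model layer: ask RedisJSON to delete the matching array element in place with JSON.DEL, rather than rewriting feed_items from Python. A rough sketch with plain redis-py, where the key name is an assumption:

import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

# Assumed key name - Redis OM stores each JsonModel under its own key, so
# check what it actually created (e.g. with SCAN) before relying on this.
feed_key = "your_module.Feed:john_001"

# Needs a Redis Stack / RedisJSON version that accepts JSONPath filter
# expressions: delete the matching element server-side instead of pulling
# the whole feed_items list into Python, filtering it and saving it back.
r.json().delete(feed_key, '$.feed_items[?(@.pk=="01GS2N47G8WK2831GNHMGRVDJT")]')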

How to parse the json data using python?

I've been messing around with JSON for some time. I want to get the values of "box" and "text" in this format using Python - can someone help me figure out how to produce this example output: [92,197,162,215,AUTHORS,...]?
{ "form": [ { "box": [ 92,162,197,215], "text": "AUTHORS", "label": "question", "words": [ { "box": [ 92,197,162,215 ],"text": "AUTHORS"} ], "linking": [[0,13]],"id": 0 },
import os
import json

# Directory name consisting of json
file = open('033.json')
data = json.load(file)
result = []
for value in data['form']:
    my_dict=[]
    my_dict=value.get('box')
    print(my_dict)
    result.append(my_dict)
Probably like this:
collector = []
for obj in data["form"]:
    collector.append({"box": obj["box"], "text": obj["text"]})
print(collector)
Okay, a few issues with your code:
Why is your list named my_dict? A name should indicate what the object is/ what it contains. Your name does the opposite and if someone works with that code in the future then it will most likely confuse them.
Why are you initializing a list before doing this value.get('box')?
As for the solution, it only requires a couple of lines of code.
result = []
for form_dict in data['form']:
    result.append(tuple(form_dict[key]
                        for key in ('box', 'text') if key in form_dict))
That piece of code would result in this: [([92, 162, 197, 215], 'AUTHORS')] based on the data you provided.
This is assuming that there can be more items in the data['form'] list, otherwise the for loop is not needed.
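And if the goal really is the flat list shown in the question ([92, 197, 162, 215, AUTHORS, ...]), the box numbers and the text can simply be concatenated per entry; a small sketch, assuming missing keys should just be skipped:

flat = []
for entry in data['form']:
    flat.extend(entry.get('box', []))   # box is a list of ints
    if 'text' in entry:
        flat.append(entry['text'])      # text is a string
print(flat)  # e.g. [92, 162, 197, 215, 'AUTHORS', ...]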

The best way to transform a response to a json format in the example

I'd appreciate your help with the best way to transform a result into JSON as below.
We have a result like the one below, containing information on employees and companies. In the result we somehow get a prefix like T., but not for all of the properties.
[
  {
    "T.id": "Employee_11",
    "T.category": "Employee",
    "node_id": ["11"]
  },
  {
    "T.id": "Company_12",
    "T.category": "Company",
    "node_id": ["12"],
    "employeecount": 800
  },
  {
    "T.id": "id~Employee_11_to_Company_12",
    "T.category": "WorksIn"
  },
  {
    "T.id": "Employee_13",
    "T.category": "Employee",
    "node_id": ["13"]
  },
  {
    "T.id": "Parent_Company_14",
    "T.category": "ParentCompany",
    "node_id": ["14"],
    "employeecount": 900,
    "childcompany": "Company_12"
  },
  {
    "T.id": "id~Employee_13_to_Parent_Company_14",
    "T.category": "Contractorin"
  }
]
We need to transform this result into a different structure, grouping by category: if the category is Employee, Company or ParentCompany, the entry should go under the node_properties object; otherwise it should go into edge_properties. Also, apart from the common properties (property_id, property_category and node), different properties have to be added if the category is Company or ParentCompany. There is some further logic where we have to derive the from and to properties of the edge object from the 'to' part of the id. The expected response is:
"node_properties":[
{
"property_id":"Employee_11",
"property_category":"Employee",
"node":{node_id: "11"}
},
{
"property_id":"Company_12",
"property_category":"Company",
"node":{node_id: "12"},
"employeecount":800
},
{
"property_id":"Employee_13",
"property_category":"Employee",
"node":{node_id: "13"}
},
{
"property_id":"Company_14",
"property_category":"ParentCompany",
"node":{node_id: "14"},
"employeecount":900,
"childcompany":"Company_12"
}
],
"edge_properties":[
{
"from":"Employee_11",
"to":"Company_12",
"property_id":"Employee_11_to_Company_12",
},
{
"from":"Employee_13",
"to":"Parent_Company_14",
"property_id":"Employee_13_to_Parent_Company_14",
}
]
In Java we would have used an enhanced for loop, switch, etc. How can we write the code in Python to get the structure above from the initial result structure? (I am new to Python.) Thank you in advance.
Regards
Here is a method I quickly put together; you can adjust it to your requirements. You can use a regex or your own function to get the IDs for the edge_properties and then assign them to an object the way I did for the nodes. I am not sure of your full requirements, but if the list you gave covers all the categories then this will be sufficient.
def transform(input_list):
    node_properties = []
    edge_properties = []
    for input_obj in input_list:
        # print(input_obj)
        new_obj = {}
        if input_obj['T.category'] in ('Employee', 'Company', 'ParentCompany'):
            new_obj['property_id'] = input_obj['T.id']
            new_obj['property_category'] = input_obj['T.category']
            new_obj['node'] = {"node_id": input_obj['node_id'][0]}
            if "employeecount" in input_obj:
                new_obj['employeecount'] = input_obj['employeecount']
            if "childcompany" in input_obj:
                new_obj['childcompany'] = input_obj['childcompany']
            node_properties.append(new_obj)
        else:  # You can do an elif == as well, based on your requirements, if there are other outliers
            # You can use regex or whichever method here to split the string and add the values like above
            edge_properties.append(new_obj)
    return [node_properties, edge_properties]
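For the part left open in the else branch, splitting an id such as "id~Employee_11_to_Company_12" into the from and to values could look roughly like this (split_edge_id is a hypothetical helper, and it assumes the ids always follow the "id~<from>_to_<to>" pattern):

def split_edge_id(t_id):
    # "id~Employee_11_to_Company_12" -> ("Employee_11", "Company_12")
    raw = t_id.split("~", 1)[1]               # drop the leading "id~"
    from_part, to_part = raw.split("_to_", 1)
    return from_part, to_part

# Inside the else branch of transform(), something like:
#     new_obj['from'], new_obj['to'] = split_edge_id(input_obj['T.id'])
#     new_obj['property_id'] = input_obj['T.id'].split("~", 1)[1]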

creating multiple python class instance from json

I am writing some tests to evaluate a REST service.
My response is:
[
  {
    "Title_Id": 1,
    "Title": "Mr",
    "TitleDescription": "Mr",
    "TitleGender": "Male",
    "Update_Date": "2012-07-21T18:43:04"
  },
  {
    "Title_Id": 2,
    "Title": "Mrs",
    "TitleDescription": "Mrs",
    "TitleGender": "Female",
    "Update_Date": "2012-07-21T18:42:59"
  },
  {
    "Title_Id": 3,
    "Title": "Sir",
    "TitleDescription": "Sir",
    "TitleGender": "Male",
    "Update_Date": null
  }
]
and I need to create multiple instances of the class
class TitleInfo:
    def __init__(self, Title_Id, Title, TitleDescription, TitleGender, Update_Date):
        self.Title_Id = Title_Id
        self.Title = Title
        self.TitleDescription = TitleDescription
        self.TitleGender = TitleGender
        self.Update_Date = Update_Date
What I have done is:
def GetTitle(self):
    try:
        response = ...  # "The string shown above"
        if isinstance(response, str):
            Records = json.loads(response)
            RecTitles = []
            for num in range(0, len(Records)):
                RecTitle = TitleInfo(Records[num]['Title_Id'], Records[num]['Title'], Records[num]['TitleDescription'], Records[num]['TitleGender'], Records[num]['Update_Date'])
                RecTitles.append(RecTitle)
This is working fine... I'd just like to know: is there a shorter, sweeter way to do that?
You could just unpack each dict and give that as an argument to TitleInfo:
RecTitles = [TitleInfo(**x) for x in json.loads(response)]
Here's the explanation from the Python tutorial:
In the same fashion, dictionaries can deliver keyword arguments with the **-operator:
>>> def parrot(voltage, state='a stiff', action='voom'):
...     print("-- This parrot wouldn't", action, end=' ')
...     print("if you put", voltage, "volts through it.", end=' ')
...     print("E's", state, "!")
...
>>> d = {"voltage": "four million", "state": "bleedin' demised", "action": "VOOM"}
>>> parrot(**d)
-- This parrot wouldn't VOOM if you put four million volts through it. E's bleedin' demised !
As an aside, you generally want to avoid hand-coding validation code. Check out an API documentation framework: Swagger, RAML, API Blueprint. All of them have tooling for request/response validation.
The next step would be to use a testing framework like dredd.

Parsing file into Parent/ Child format for a JSON file

I would like some help/advice on how to parse this file for Gene Ontology (.obo).
I am working on creating a visualisation in D3, and need to create a "tree" file in JSON format:
{
  "name": "flare",
  "description": "flare",
  "children": [
    {
      "name": "analytic",
      "description": "analytics",
      "children": [
        {
          "name": "cluster",
          "description": "cluster",
          "children": [
            {"name": "Agglomer", "description": "AgglomerativeCluster", "size": 3938},
            {"name": "Communit", "description": "CommunityStructure", "size": 3812},
            {"name": "Hierarch", "description": "HierarchicalCluster", "size": 6714},
            {"name": "MergeEdg", "description": "MergeEdge", "size": 743}
          ]
        }, etc..
This format seems fairly easy to replicate in a dictionary in Python, with 3 fields for each entry: name, description, and children[].
My problem here is actually HOW to extract the data. The file linked above has "objects" structured as:
[Term]
id: GO:0000001
name: mitochondrion inheritance
namespace: biological_process
def: "The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton." [GOC:mcc, PMID:10873824, PMID:11389764]
synonym: "mitochondrial inheritance" EXACT []
is_a: GO:0048308 ! organelle inheritance
is_a: GO:0048311 ! mitochondrion distribution
I will need the id, is_a and name fields. I have tried using Python to parse this, but I can't seem to find a way to locate each object.
Any ideas?
Here's a fairly simple way to parse the objects in your '.obo' file. It saves the object data into a dict with the id as the key and the name and is_a data saved in a list. Then it pretty-prints it using the standard json module's .dumps function.
For testing purposes, I used a truncated version of the file in your link that only includes up to id: GO:0000006.
This code ignores any objects that contain the is_obsolete field. It also removes the description info from the is_a fields; I figured you probably wanted that, but it's easy enough to disable that functionality.
#!/usr/bin/env python
''' Parse object data from a .obo file

    From http://stackoverflow.com/q/32989776/4014959

    Written by PM 2Ring 2015.10.07
'''

from __future__ import print_function, division
import json
from collections import defaultdict

fname = "go-basic.obo"
term_head = "[Term]"

#Keep the desired object data here
all_objects = {}

def add_object(d):
    #print(json.dumps(d, indent = 4) + '\n')

    #Ignore obsolete objects
    if "is_obsolete" in d:
        return

    #Gather desired data into a single list,
    # and store it in the main all_objects dict
    key = d["id"][0]
    is_a = d["is_a"]

    #Remove the next line if you want to keep the is_a description info
    is_a = [s.partition(' ! ')[0] for s in is_a]
    all_objects[key] = d["name"] + is_a

#A temporary dict to hold object data
current = defaultdict(list)

with open(fname) as f:
    #Skip header data
    for line in f:
        if line.rstrip() == term_head:
            break

    for line in f:
        line = line.rstrip()
        if not line:
            #ignore blank lines
            continue
        if line == term_head:
            #end of term
            add_object(current)
            current = defaultdict(list)
        else:
            #accumulate object data into current
            key, _, val = line.partition(": ")
            current[key].append(val)

if current:
    add_object(current)

print("\nall_objects =")
print(json.dumps(all_objects, indent = 4, sort_keys=True))
output
all_objects =
{
    "GO:0000001": [
        "mitochondrion inheritance",
        "GO:0048308",
        "GO:0048311"
    ],
    "GO:0000002": [
        "mitochondrial genome maintenance",
        "GO:0007005"
    ],
    "GO:0000003": [
        "reproduction",
        "GO:0008150"
    ],
    "GO:0000006": [
        "high-affinity zinc uptake transmembrane transporter activity",
        "GO:0005385"
    ]
}
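To get from that flat all_objects dict to the D3-style children tree at the top of the question, one option is to invert the is_a links into children lists. A sketch (build_tree is a hypothetical helper; it assumes a term should hang under each of its is_a parents, and that parents missing from the parsed set fall back to an artificial root):

def build_tree(all_objects):
    # One node per GO id; "description" gets the full name, "name" a short form
    nodes = {
        go_id: {"name": values[0][:8], "description": values[0], "children": []}
        for go_id, values in all_objects.items()
    }
    root = {"name": "GO", "description": "Gene Ontology", "children": []}
    for go_id, values in all_objects.items():
        parents = values[1:]  # the is_a ids collected above
        if not parents:
            root["children"].append(nodes[go_id])
        for parent_id in parents:
            # Parents outside the parsed set just go under the root
            parent = nodes.get(parent_id)
            (parent["children"] if parent else root["children"]).append(nodes[go_id])
    return root

print(json.dumps(build_tree(all_objects), indent=4))

The flare sample also carries a size value on the leaves; you would have to decide what that should mean for GO terms.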
