Counting multiple json inputs python - python

I get an input like this:
input 1:
{
"name": "Ben",
"description": "Ben",
"attributes": [
{
"type": "Background",
"value": "Default"
},
{
"type": "Hair-color",
"value": "Brown"
}
]
}
input 2
{
"name": "Ice",
"description": "Ice",
"attributes": [
{
"type": "Background",
"value": "Green"
},
{
"type": "Hair-color",
"value": "White"
}
]
}
input 3
{
"name": "Itay",
"description": "Itay",
"attributes": [
{
"type": "Background",
"value": "Default"
},
{
"type": "Hair-color",
"value": "Brown"
}
]
}
What I want to do is count the amount of each type of background and each type of hair-color appearing.
(These are sample examples and in reality there are more types and different values)
Let's say in these examples we have 2 objects that have a background as default then I want to have a count of that like so:
background default = 2
hair-color brown = 2
background green = 1
hair-color white = 1
I want the most effective code because there are other aspects to the code, in addition it will run on thousands of queries not just two, so needs to run in good times too :D
My code so far:
import requests
import json
from collections import Counter
from collections import defaultdict
from time import sleep
# Tally, per trait, how often each value occurs across all fetched items.
# Counting on the fly avoids storing every raw value for thousands of items,
# and a dict already remembers the order in which traits were first seen.
test_dict = defaultdict(Counter)
for i in range(min_id, max_id + 1):
    api = 'api/v1/test/{}'.format(i)
    response = requests.get(api)
    item_dict = json.loads(response.text)
    # NOTE(review): the sample payloads in the question use the key "type",
    # while this code reads "trait_type" - confirm which one the API returns.
    for item in item_dict['attributes']:
        test_dict[item["trait_type"]][item["value"]] += 1
    # Short pause between requests (presumably to stay under a rate limit).
    sleep(0.02)
# Trait names in first-seen order, kept under the original variable name.
attributes = list(test_dict)
for attribute in attributes:
    print(attribute)
    print(test_dict[attribute])

This should work for you:
def constract_data(data_dict):
    """Count how often each (attribute type, value) pair occurs.

    Args:
        data_dict: Iterable of records; each record is a mapping with an
            ``"attributes"`` list of ``{"type": ..., "value": ...}`` mappings.

    Returns:
        A ``(output, total_count)`` tuple. ``output`` is a list of dicts of
        the form ``{<type, lowercased>: <value, lowercased>, 'count': n}`` in
        first-seen order, and ``total_count`` is the number of attributes
        visited.
    """
    output = []
    # Maps (type, value) to its entry in ``output`` so each attribute is
    # matched with one dict lookup instead of a scan over the whole list.
    entries = {}
    total_count = 0
    for data in data_dict:
        for attribute in data["attributes"]:
            total_count += 1
            dict_key = attribute["type"].lower()
            dict_value = attribute["value"].lower()
            entry = entries.get((dict_key, dict_value))
            if entry is None:
                entry = {dict_key: dict_value, 'count': 0}
                entries[(dict_key, dict_value)] = entry
                output.append(entry)
            entry['count'] += 1
    return output, total_count
def calculate_occurrence_ratio(data_dict, total_count):
    """Annotate each count entry with its share of all attributes.

    Args:
        data_dict: List of dicts carrying a ``'count'`` key (a missing key
            is treated as 0). The dicts are updated in place.
        total_count: Total number of attributes; used as the denominator.

    Returns:
        The same ``data_dict`` list, each entry now holding a ``'ratio'``
        string: the percentage rounded to two decimals, followed by ``%``.
    """
    for data in data_dict:
        count = data.get('count', 0)
        ratio = round((count / total_count) * 100, 2)
        data['ratio'] = f'{ratio}%'
    return data_dict
data_dict = [
{
"name":"Ice",
"description":"Ice",
"attributes":[
{
"type":"Background",
"value":"Green"
},
{
"type":"Hair-color",
"value":"White"
}
]
},
{
"name":"Ben",
"description":"Ben",
"attributes":[
{
"type":"Background",
"value":"Default"
},
{
"type":"Hair-color",
"value":"Brown"
}
]
},
{
"name":"Itay",
"description":"Itay",
"attributes":[
{
"type":"Background",
"value":"Default"
},
{
"type":"Hair-color",
"value":"Brown"
}
]
}
]
# Count every (type, value) pair, then attach each pair's percentage of all
# attributes before printing the annotated list.
output_data, total_count = constract_data(data_dict)
output_data = calculate_occurrence_ratio(output_data, total_count)
print(output_data)
Output:
[{'background': 'green', 'count': 1, 'ratio': '16.67%'}, {'hair-color': 'white', 'count': 1, 'ratio': '16.67%'}, {'background': 'default', 'count': 2, 'ratio': '33.33%'}, {'hair-color': 'brown', 'count': 2, 'ratio': '33.33%'}]

this solution will work for you:
list_data = [
    {
        "name": "Ice",
        "description": "Ice",
        "attributes": [
            {"type": "Background", "value": "Green"},
            {"type": "Hair-color", "value": "White"},
            {"type": "other", "value": "White"},
        ],
    },
    {
        "name": "Ben",
        "description": "Ben",
        "attributes": [
            {"type": "Background", "value": "Default"},
            {"type": "Hair-color", "value": "Brown"},
        ],
    },
    {
        "name": "Itay",
        "description": "Itay",
        "attributes": [
            {"type": "Background", "value": "Default"},
            {"type": "Hair-color", "value": "Brown"},
        ],
    },
]


def summarize_attributes(records):
    """Summarise how often each value occurs within its attribute type.

    Counts are kept as integers and formatted only at the end. Reading the
    running count back out of the formatted text by its first character
    breaks as soon as a count reaches 10.

    Args:
        records: Iterable of mappings, each with an ``"attributes"`` list of
            ``{"type": ..., "value": ...}`` mappings.

    Returns:
        A ``(summary, totals)`` tuple. ``summary`` maps type -> value ->
        ``"<count> with: <percent>%"`` where the percentage is relative to
        that type and truncated to an integer. ``totals`` maps type -> number
        of attributes of that type.
    """
    counts = {}
    for record in records:
        for attribute in record["attributes"]:
            per_type = counts.setdefault(attribute["type"], {})
            value = attribute["value"]
            per_type[value] = per_type.get(value, 0) + 1
    totals = {kind: sum(values.values()) for kind, values in counts.items()}
    summary = {
        kind: {
            value: "{} with: {}%".format(count, int(count / totals[kind] * 100))
            for value, count in values.items()
        }
        for kind, values in counts.items()
    }
    return summary, totals


output, all_count = summarize_attributes(list_data)
print(output)

Here is another approach:
from collections import defaultdict
data_dict = [
{
"name":"Ice",
"description":"Ice",
"attributes":[
{
"type":"Background",
"value":"Green"
},
{
"type":"Hair-color",
"value":"White"
}
]
},
{
"name":"Ben",
"description":"Ben",
"attributes":[
{
"type":"Background",
"value":"Default"
},
{
"type":"Hair-color",
"value":"Brown"
}
]
},
{
"name":"Itay",
"description":"Itay",
"attributes":[
{
"type":"Background",
"value":"Default"
},
{
"type":"Hair-color",
"value":"Brown"
}
]
}
]
# Nested tally: attribute type -> attribute value -> number of occurrences.
out = defaultdict(lambda: defaultdict(int))
# Number of attributes seen per type; the denominator of each ratio.
tot_count = defaultdict(int)
for data in data_dict:
    for attri in data['attributes']:
        tot_count[attri['type']] += 1
        out[attri['type']][attri['value']] += 1
for k, v in out.items():
    for k1, v1 in v.items():
        # int() truncates, so 2 out of 3 is reported as 66%, not 67%.
        print(f'{k.lower()} {k1} count={v1} ratio={int(v1*100/tot_count[k])}%')
Output:
background Green count=1 ratio=33%
background Default count=2 ratio=66%
hair-color White count=1 ratio=33%
hair-color Brown count=2 ratio=66%

Related

Writing resilient recursive code that will return results from a big json file

I have written some recursive code, and I would like more experienced people to tell me how resilient and fail-safe it is:
I have a json file (Json file can be as big as 300MB):
[
{
"modules": {
"webpages": []
},
"webpages": {
"ip_addr": {
"value": "127.0.0.1",
"tags": []
},
"http": {
"status": {
"value": "Unavailable",
"tags": []
},
"title": {
"value": "403 Forbidden",
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
"server": {
"value": "Apache",
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
},
"redirects": [],
"robottxt": null
}
},
{
"modules": {
"webpages": []
}
}
]
I want to return value keys where tags are populated.
So I want to ignore:
"status": {
"value": "Unavailable",
"tags": []
},
But I want to return the title and server values. I also want to return ip_addr.value
I have written this code:
def getAllValues(nestedDictionary, firstArray, firstObj, firstUseful):
    """Collect every nested entry whose ``"tags"`` list is non-empty.

    Walks ``nestedDictionary`` recursively. Each nested dict gets a fresh
    result record ``{"Key": <its key>, "useful": <context>}``; when that dict
    holds a ``"value"`` it is copied into the record, and when it holds a
    non-empty ``"tags"`` list the record is appended to the results.

    Args:
        nestedDictionary: The dict to walk. Lists are not descended into.
        firstArray: Accumulator list; matches are appended to it in place.
        firstObj: Record that ``"value"`` / ``"tags"`` found directly in
            ``nestedDictionary`` are written to.
        firstUseful: Context dict attached to records created at this level,
            unless this level carries its own ``"ip_addr"`` entry.

    Returns:
        ``firstArray`` (the same list object) with all matches appended.
    """
    returnedArray = firstArray
    tempValue = firstObj
    useful = firstUseful
    # The IP lookup does not depend on the loop variable, so it is done once
    # per level. The key is spelled "ip_addr" to match the source field and
    # the documented expected output.
    ipString = nestedDictionary.get("ip_addr")
    if isinstance(ipString, dict):
        useful = {"ip_addr": ipString.get("value")}
    for key, value in nestedDictionary.items():
        if isinstance(value, dict):
            getAllValues(value, returnedArray, {"Key": key, "useful": useful}, useful)
        elif key == "value":
            tempValue["value"] = value
        elif key == "tags" and isinstance(value, list) and value:
            tempValue["tags"] = value
            # The record is appended by reference, so a "value" key that
            # follows "tags" in the same dict still lands in the result.
            returnedArray.append(tempValue)
    return returnedArray
The above code should return:
[
{
"Key": "title",
"value": "403 Forbidden",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
{
"Key": "server",
"value": "Apache",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
]
It's a long post, but hopefully someone can give me some assurance :)

Transforming streaming data into a json

I have a streaming data coming from a source and I was able to capture few important variables like ID, Timestamp and Vital signs. What I am trying to do is create a json file object and unload it into a file system. This is one of the examples:
for k in streaming:
id = k[1] ----------> 1
timestamp = k[2] ------> 1652304692
vsigns = k[3] -------> Nested dictionary
This is how vsigns looks like
"vsigns":
{
"ECG":
{
"VPB_FREQ": "0",
"STI": "+0.00"
},
"HR":
{
"HR_EKG": "87",
"HR_PPG_1": "87",
"HR_PULSED": "87"
},
"NIBP":
{
"NIBPS": "119",
"NIBPD": "88",
"NIBPM": "95"
}
}
And I want a json structure in the following format:
[{
"id": "1",
"ECG":
{
"timestamp": 1652304692,
"VPB_FREQ": "0",
"STI": "+0.00",
}
},
{
"id": "1",
"HR":
{
"timestamp": 1652304692,
"HR_EKG": "87",
"HR_PPG_1": "87",
"HR_PULSED": "87"
}
},
{
"id": "1",
"NIBP":
{
"timestamp": 1652304692,
"NIBPS": "119",
"NIBPD": "88",
"NIBPM": "95"
}
}]
I tried an approach but doesn't give me what I want. How do I get this right.
# Asker's attempt: one entry per vital-sign group for each streamed message.
# The readings are nested under an extra "vsigns" key, which is why the
# result does not match the desired layout.
for k in streaming:
    id = k[1]
    timestamp = k[2]
    vsigns = k[3]
    vlist = []
    # Distinct loop names so the outer message variable ``k`` is not rebound.
    for group, readings in vsigns.items():
        vdict = {"id": id,
                 group: {"timestamp": timestamp,
                         "vsigns": readings}}
        vlist.append(vdict)
    print(vlist)
Output:
[{
"id": "1",
"ECG":
{
"timestamp": 1652951054.0,
"vsigns":
{
"VPB_FREQ": "0"
}
}
},
{
"id": "1",
"HR":
{
"timestamp": 1652951054.0,
"vsigns":
{
"HR_EKG": "126",
"HR_PPG_1": "127",
"HR_PULSED": "127"
}
}
},
{
"id": "1",
"NIBP":
{
"timestamp": 1652951054.0,
"vsigns":
{
"NIBPS": "95",
"NIBPD": "46",
"NIBPM": "62"
}
}
}
]
The following piece of code worked for me:
# Build, for each streamed message, one entry per vital-sign group with the
# message timestamp merged into that group's readings.
for k in streaming:
    id = k[1]
    timestamp = k[2]
    vsigns = k[3]
    vlist = []
    # Distinct loop names so the outer message variable ``k`` is not rebound.
    for group, readings in vsigns.items():
        # NOTE(review): the original read an undefined ``epoch_time`` here;
        # the ``timestamp`` extracted above is assumed to be the intended
        # value - confirm. A new dict is built so the streamed readings are
        # not mutated, with "timestamp" first as in the desired layout.
        vdict = {"id": id,
                 group: {"timestamp": timestamp, **readings}}
        vlist.append(vdict)

duplicates in a JSON file based on two attributes

I have a JSON file and that is a nested JSON. I would like to remove duplicates based on two keys.
JSON example:
"books": [
{
"id": "1",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
},
{
"name": "Peter",
"main": 0
}
]
}
]
},
{
"id": "2",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "Jeroge",
"main": 1
},
{
"name": "Peter",
"main": 0
},
{
"name": "John",
"main": 0
}
]
}
]
},
{
"id": "3",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
}
]
}
]
}
]
Here I try to match on the title and the author names. For example, id 1 and id 2 are duplicates, because the title is the same and the author names are also the same (the order of the authors doesn't matter, and the main attribute can be ignored). So in the output JSON only one of id 1 or id 2 should remain, together with id 3. In the final output I need two files.
Output_JSON:
"books": [
{
"id": "1",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
},
{
"name": "Peter",
"main": 0
}
]
}
]
},
{
"id": "3",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
}
]
}
]
}
]
duplicatedID.csv:
1-2
The following method I tried but it is not giving correct results:
# Signatures (title plus the set of author names) of the books kept so far.
# A set of hashable signatures gives O(1) duplicate checks and avoids
# shadowing the builtin ``list``.
seen_books = set()
# IDs of books whose signature matches an earlier book.
duplicate_Id = []
for data in json_data['books']:
    # Author order is irrelevant, so the names are compared as a frozenset.
    authors = frozenset(
        author['name'] for author in data['description'][0]['author']
    )
    signature = (data['story']['title'], authors)
    if signature in seen_books:
        duplicate_Id.append(data['id'])
    else:
        seen_books.add(signature)
The general idea is to:
Get the groups identified by some function that collects duplicates.
Then return the first entry of each group, ensuring no duplicates.
Define the key function as the sorted list of author names: the set of authors is, by definition, the unique key, but the names may appear in any order.
import json
from itertools import groupby
j = json.load(books)
def transform(books):
    """Return the first book of every group sharing the same authors.

    Books are grouped with a dict keyed on their sorted author names, so
    duplicates are found even when they are not next to each other in the
    input (``itertools.groupby`` only merges consecutive items).

    NOTE(review): the key covers authors only; if the title must match too,
    add it to the key.

    Args:
        books: Iterable of book mappings accepted by ``getAuthors``.

    Returns:
        A list with one book per distinct author set, in first-seen order.
    """
    groups = {}
    for book in books:
        groups.setdefault(tuple(getAuthors(book)), []).append(book)
    return [group[0] for group in groups.values()]


def getAuthors(book):
    """Return the sorted author names of the book's first description."""
    authors = book['description'][0]['author']
    return sorted(author['name'] for author in authors)
print(transform(j['books']))
If we wanted to get the duplicates, then we do the same computation, but return any sublist with length > 1 as this is by our definition duplicated data.
def transform(books):
    """Return every group of books that share the same authors.

    Grouping uses a dict keyed on the sorted author names, so duplicates are
    detected regardless of their position in the input.

    Args:
        books: Iterable of book mappings accepted by ``getAuthors``.

    Returns:
        A list of groups (lists of books) that contain more than one book.
    """
    groups = {}
    for book in books:
        groups.setdefault(tuple(getAuthors(book)), []).append(book)
    return [group for group in groups.values() if len(group) > 1]
Where j['books'] is the JSON you gave enclosed in an object.

Python dictionary from schema

I need to check if all the keys declared in a schema file are present in a dictionary and if they are not, I need to fill them with a default value, of a given type. I need to do this dynamically because the structure can be even more complicated than the one below.
{
"type": "object",
"properties": {
"vid": {
"type": ["null", "integer"]
},
"merged-vids": {
"type": ["null", "array"],
"items": {
"type": ["null", "integer"]
}
},
"portal-id": {
"type": ["null", "integer"]
},
"is-contact": {
"type": ["null", "boolean"]
},
"form-submissions": {
"type": ["null", "array"],
"items": {
"type": ["null", "object"],
"properties": {
"conversion-id": {
"type": ["null", "string"]
},
"timestamp": {
"type": ["null", "string"],
"format": "date-time"
},
"form-id": {
"type": ["null", "string"]
},
"portal-id": {
"type": ["null", "integer"]
},
"page-url": {
"type": ["null", "string"]
},
"title": {
"type": ["null", "string"]
}
}
}
}
}
}
This is an example of dictionary:
{
"vid": 1000,
"portal-id": 2512,
"is-contact": true,
"profile-token": "dummy_profile_token",
"profile-url": "dummy_profile_url",
"form-submissions": [
{
"conversion-id": "127-798",
"timestamp": 1484080167266,
"form-id": "70fd-4b98-14796-777",
"page-url": "https://example.com/landing-page-url",
"title": "A new test form",
"meta-data": []
}
]
}
I am also new to python and this is a bit too much.
This is what I tried, but I cannot figure out what to do.
def get_default(type_object):
    """Return the empty default value for a JSON-schema type.

    Args:
        type_object: A schema type name such as ``'object'``, or a list of
            names such as ``["null", "array"]``. For a list, the first entry
            other than ``'null'`` decides the default.

    Returns:
        ``{}`` for ``'object'``, ``[]`` for ``'array'`` and ``''`` for
        anything else.
    """
    if isinstance(type_object, (list, tuple)):
        candidates = [name for name in type_object if name != 'null']
        type_object = candidates[0] if candidates else 'null'
    if type_object == 'object':
        return {}
    if type_object == 'array':
        return []
    return ''
def _schema_types(schema):
    """Return the schema's declared type(s) as a list of type names."""
    declared = schema.get('type', [])
    return [declared] if isinstance(declared, str) else list(declared)


def _empty_value(types):
    """Return the empty default for the given list of schema type names."""
    if 'object' in types:
        return {}
    if 'array' in types:
        return []
    return ''


def fill_fields_with_empty_str(record, schema):
    """Return a copy of *record* with every field declared in *schema* present.

    A field counts as missing when it is absent or ``None``; other falsy
    values (``0``, ``False``, ``''``) are real data and are kept. Missing
    fields get ``{}`` for objects (then filled recursively), ``[]`` for
    arrays and ``''`` for everything else. Keys that the schema does not
    declare are preserved. The input is not modified.

    Args:
        record: The value to complete (dict, list or scalar; may be None).
        schema: JSON-schema fragment with ``type`` and, where applicable,
            ``properties`` or ``items``.

    Returns:
        The completed value.
    """
    types = _schema_types(schema)
    if record is None:
        record = _empty_value(types)
    if 'object' in types and isinstance(record, dict):
        filled = dict(record)
        for key, sub_schema in schema.get('properties', {}).items():
            filled[key] = fill_fields_with_empty_str(record.get(key), sub_schema)
        return filled
    if 'array' in types and isinstance(record, list):
        # Complete each existing element against the item schema.
        item_schema = schema.get('items', {})
        return [fill_fields_with_empty_str(element, item_schema) for element in record]
    return record

manipulating json in python using recursion

All,
I am trying to change the way some json looks by going through and formatting it in the following way:
1. flatten all of the fields lists
2. Then remove the fields lists and replace them with the name : flatten list
Example:
{
"name": "",
"fields": [{
"name": "keys",
"fields": [{
"node-name": "0/0/CPU0"
},
{
"interface-name": "TenGigE0/0/0/47"
},
{
"device-id": "ASR9K-H1902.corp.cisco.com"
}
]
},
{
"name": "content",
"fields": [{
"name": "lldp-neighbor",
"fields": [{
"receiving-interface-name": "TenGigE0/0/0/47"
},
{
"receiving-parent-interface-name": "Bundle-Ether403"
},
{
"device-id": "ASR9K-H1902.corp.cisco.com"
},
{
"chassis-id": "78ba.f975.a64f"
},
{
"port-id-detail": "Te0/1/0/4/0"
},
{
"header-version": 0
},
{
"hold-time": 120
},
{
"enabled-capabilities": "R"
},
{
"platform": ""
}
]
}]
}
]
}
Would turn into:
{
"": [{
"keys": [{
"node-name": "0/0/CPU0",
"interface-name": "TenGigE0/0/0/47",
"device-id": "ASR9K-H1902.corp.cisco.com"
}]
},
{
"content": [{
"lldp-neighbor": [{
"receiving-interface-name": "TenGigE0/0/0/47",
"receiving-parent-interface-name": "Bundle-Ether403",
"device-id": "ASR9K-H1902.corp.cisco.com",
"chassis-id": "78ba.f975.a64f",
"port-id-detail": "Te0/1/0/4/0",
"header-version": 0,
"hold-time": 120,
"enabled-capabilities": "R",
"platform": ""
}]
}]
}
]
}
I have tried the following to get the list flattened:
def _flatten_fields(self, fields_list):
    """Merge a list of dicts into a single dict.

    Falsy entries (empty dicts, ``None``) are skipped. When a key occurs in
    more than one entry, the last occurrence wins.

    Args:
        fields_list: Iterable of dicts to merge.

    Returns:
        A new dict holding the combined key/value pairs.
    """
    c = {}
    for b in fields_list:
        if b:
            c.update(b)
    return c
This seems to work but I can't figure out a way to get into the sub levels using recursion, I am saving all flatten lists and names into a new dictionary, is there a way to do it by just manipulating the original dictionary?
This worked on the example you provided:
import json
def flatten(data):
    """Collapse a ``name``/``fields`` tree into nested plain dicts.

    Args:
        data: Either a dict or an iterable of dicts. A dict with a ``'name'``
            key must also have ``'fields'``; it becomes
            ``{name: flatten(fields)}``. Any other dict is treated as a leaf
            and its entries are copied as they are. An iterable has all of
            its flattened entries merged into one dict.

    Returns:
        The flattened dict.
    """
    result = dict()
    if isinstance(data, dict):
        if 'name' in data:
            name = data['name']
            result[name] = flatten(data['fields'])
        else:
            # Copy the leaf entries directly; indexing dict.keys() or
            # dict.values() only works on Python 2.
            result.update(data)
    else:
        for entry in data:
            result.update(flatten(entry))
    return result
# Pretty-print the flattened structure (print is a function on Python 3).
print(json.dumps(flatten(data), indent=4))
Output
{
"": {
"keys": {
"node-name": "0/0/CPU0",
"interface-name": "TenGigE0/0/0/47",
"device-id": "ASR9K-H1902.corp.cisco.com"
},
"content": {
"lldp-neighbor": {
"receiving-interface-name": "TenGigE0/0/0/47",
"receiving-parent-interface-name": "Bundle-Ether403",
"header-version": 0,
"port-id-detail": "Te0/1/0/4/0",
"chassis-id": "78ba.f975.a64f",
"platform": "",
"device-id": "ASR9K-H1902.corp.cisco.com",
"hold-time": 120,
"enabled-capabilities": "R"
}
}
}
}
It doesn't have the extra list layers shown in your expected output, but I don't think you want those.
This worked on the example you provided:
def flatten_fields(fields_list):
    """Flatten a list of ``name``/``fields`` dicts into a one-element list.

    Every item's ``"fields"`` list is flattened recursively and stored under
    the item's ``"name"``. All other keys except ``"name"`` are copied as
    they are. Every key of an item is visited, so a leaf with several keys
    keeps all of them and ``"fields"`` is handled wherever it appears.

    Args:
        fields_list: List of dicts. An item with ``"fields"`` must also have
            ``"name"``.

    Returns:
        ``[merged]``, where ``merged`` is the combined dict for this level.
    """
    c = {}
    for item in fields_list:
        for key, value in item.items():
            if key == "fields":
                c[item["name"]] = flatten_fields(value)
            elif key != "name":
                c[key] = value
    return [c]
But it works on a list of dictionaries, so you should call it like flatten_fields([data])[0].
The output is:
{
"": [{
"keys": [{
"node-name": "0/0/CPU0",
"interface-name": "TenGigE0/0/0/47",
"device-id": "ASR9K-H1902.corp.cisco.com"
}],
"content": [{
"lldp-neighbor": [{
"chassis-id": "78ba.f975.a64f",
"receiving-parent-interface-name": "Bundle-Ether403",
"enabled-capabilities": "R",
"device-id": "ASR9K-H1902.corp.cisco.com",
"hold-time": 120,
"receiving-interface-name": "TenGigE0/0/0/47",
"platform": "",
"header-version": 0,
"port-id-detail": "Te0/1/0/4/0"
}]
}]
}]
}

Categories

Resources