Json comparison with different link among objects in python - python

I'm trying to compare 2 JSON documents that show a different order in the id values, even though they are conceptually the same, as you can see from the example.
For instance, each Person object points to an Address object based on its id. The ids could be different but, conceptually, the 2 JSON documents are the same. Here is an example: the order of the Address_Id values is different (as is the position of the Address objects), but the 2 documents are the same (John lives in NYC and Pippo in BCN).
{
"Persons": [
{
"Name": "John",
"Address_Id": "0"
},
{
"Name": "Pippo",
"Address_Id": "1"
}
],
"Addresses": [
{
"City": "NYC"
},
{
"City": "BCN"
}
]
}
{
"Persons": [
{
"Name": "John",
"Address_Id": "1"
},
{
"Name": "Pippo",
"Address_Id": "0"
}
],
"Addresses": [
{
"City": "BCN"
},
{
"City": "NYC"
}
]
}
Is there a way to compare the 2 json considering this "special case" ??
Thanks.

You can deserialise the json yourself by creating the equivalent Python classes and implementing the comparison yourself
. For example:
import json
class Person_book:
    """An unordered collection of Person records.

    Two books are equal when they contain the same people, regardless of
    the order they appear in (which is exactly the id-insensitive JSON
    comparison the question asks for).
    """

    def __init__(self, person_list):
        self.person_list = person_list

    def __eq__(self, other):
        """Order-insensitive, duplicate-aware comparison.

        A plain `person in other.person_list` check would let
        [A, A] compare equal to [A, B]; matching against a shrinking
        copy requires each person to appear the same number of times
        on both sides.
        """
        if len(self.person_list) != len(other.person_list):
            return False
        remaining = list(other.person_list)
        for person in self.person_list:
            if person in remaining:
                remaining.remove(person)
            else:
                return False
        return True
class Person:
    """A named person linked to an already-resolved address record.

    Storing the resolved address (not the raw Address_Id) is what makes
    the comparison independent of the id numbering in each JSON.
    """

    def __init__(self, person_id, address):
        self.person_id = person_id  # in this example: the person's name
        self.address = address      # the address entry itself, not its index

    def __str__(self):
        return str(self.person_id) + ' ' + str(self.address)

    def __eq__(self, other):
        # Value comparison so membership tests in Person_book work.
        return self.person_id == other.person_id and self.address == other.address
def deserialize_persons(persons_array):
    """Build a Person_book from the parsed JSON document.

    Each person's Address_Id is resolved to the actual address entry, so
    the resulting book no longer depends on how the ids were numbered.
    """
    address_entries = persons_array['Addresses']
    people = [
        Person(entry['Name'], address_entries[int(entry['Address_Id'])])
        for entry in persons_array['Persons']
    ]
    return Person_book(people)
json_resp_1 = """{
"Persons": [
{
"Name": "John",
"Address_Id": "0"
},
{
"Name": "Pippo",
"Address_Id": "1"
}
],
"Addresses": [
{
"City": "NYC"
},
{
"City": "BCN"
}
]
}"""
json_resp_2 = """
{
"Persons": [
{
"Name": "John",
"Address_Id": "1"
},
{
"Name": "Pippo",
"Address_Id": "0"
}
],
"Addresses": [
{
"City": "BCN"
},
{
"City": "NYC"
}
]
}"""
# Parse both documents and compare them through the id-insensitive
# Person_book equality defined above.
resp_1 = json.loads(json_resp_1)
resp_2 = json.loads(json_resp_2)
person_book_1 = deserialize_persons(resp_1)
person_book_2 = deserialize_persons(resp_2)
print(person_book_1 == person_book_2)  # True: same name/city pairings in both

Related

Writing resilient recursive code that will return results from a big json file

I have written a recursive function. I want more experienced people to tell me how resilient and fail-safe my code is:
I have a json file (Json file can be as big as 300MB):
[
{
"modules": {
"webpages": []
},
"webpages": {
"ip_addr": {
"value": "127.0.0.1",
"tags": []
},
"http": {
"status": {
"value": "Unavailable",
"tags": []
},
"title": {
"value": "403 Forbidden",
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
"server": {
"value": "Apache",
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
},
"redirects": [],
"robottxt": null
}
},
{
"modules": {
"webpages": []
}
}
]
I want to return value keys where tags are populated.
So I want to ignore:
"status": {
"value": "Unavailable",
"tags": []
},
But I want to return the title and server values. I also want to return ip_addr.value
I have written this code:
def getAllValues(nestedDictionary, firstArray, firstObj, firstUseful):
    """Recursively collect every {"value": ..., "tags": [...]} node whose
    tags list is non-empty.

    Each hit is recorded by mutating the "temp" dict the caller built for
    this node ({"Key": <node name>, "useful": {"ip_addr": ...}}) with the
    node's value and tags, then appending it to the shared result list.

    nestedDictionary -- the dict currently being walked
    firstArray       -- accumulator list, shared across the recursion
    firstObj         -- the temp dict built by the caller for this node
    firstUseful      -- last {"ip_addr": ...} context seen above this node
    """
    returnedArray = firstArray
    tempValue = firstObj
    useful = firstUseful
    # Hoisted out of the loop (it does not depend on the loop variable):
    # a sibling "ip_addr" entry provides the context attached to every
    # hit found below this level.
    ipString = nestedDictionary.get("ip_addr")
    if ipString is not None:
        # Fixed key: the original wrote "ip_add", which never matched the
        # expected {"useful": {"ip_addr": ...}} output.
        useful = {"ip_addr": ipString.get("value")}
    for key, value in nestedDictionary.items():
        if isinstance(value, dict):
            temp = {
                "Key": key,
                "useful": useful,
            }
            getAllValues(value, returnedArray, temp, useful)
        else:
            if key == "value":
                tempValue["value"] = value
            # Only nodes with at least one tag are reported.
            if key == "tags" and isinstance(value, list) and len(value) > 0:
                tempValue["tags"] = value
                returnedArray.append(tempValue)
    return returnedArray
The above code should return:
[
{
"Key": "title",
"value": "403 Forbidden",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
{
"Key": "server",
"value": "Apache",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
]
Its a long post, but hopefully, someone can give me some assurance :)

How to extract specific nested JSON value doing loop? (Python)

{
"127.0.0.1":{
"addresses":{
"ipv4":"127.0.0.1"
},
"hostnames":[
{
"name":"localhost",
"type":"PTR"
}
],
"status":{
"reason":"conn-refused",
"state":"up"
},
"tcp":{
"5000":{
"conf":"10",
"cpe":"cpe:/a:python:python:3.9.2",
"extrainfo":"Python 3.9.2",
"name":"http",
"product":"Werkzeug httpd",
"reason":"syn-ack",
"script":{
"vulners":"\n cpe:/a:python:python:3.9.2: \n \tCVE-2021-29921\t7.5\thttps://vulners.com/cve/CVE-2021-29921\n \tCVE-2021-23336\t4.0\thttps://vulners.com/cve/CVE-2021-23336\n \tMSF:ILITIES/DEBIAN-CVE-2021-3426/\t2.7\thttps://vulners.com/metasploit/MSF:ILITIES/DEBIAN-CVE-2021-3426/\t*EXPLOIT*\n \tCVE-2021-3426\t2.7\thttps://vulners.com/cve/CVE-2021-3426"
},
"state":"open",
"version":"1.0.1"
},
"6000":{
"conf":"10",
"cpe":"cpe:/a:python:python:3.9.2",
"extrainfo":"Python 3.9.2",
"name":"http",
"product":"Werkzeug httpd",
"reason":"syn-ack",
"script":{
"vulners":"\n cpe:/a:python:python:3.9.2: \n \tCVE-2021-29921\t7.5\thttps://vulners.com/cve/CVE-2021-29921\n \tCVE-2021-23336\t4.0\thttps://vulners.com/cve/CVE-2021-23336\n \tMSF:ILITIES/DEBIAN-CVE-2021-3426/\t2.7\thttps://vulners.com/metasploit/MSF:ILITIES/DEBIAN-CVE-2021-3426/\t*EXPLOIT*\n \tCVE-2021-3426\t2.7\thttps://vulners.com/cve/CVE-2021-3426"
},
"state":"open",
"version":"1.0.1"
}
},
"vendor":{
}
}
}
I want to extract "vulners" value here i tried this -
# NOTE(review): `collection`, `json_util`, `datetime` and `domaindata` must be
# imported/defined elsewhere (pymongo/bson and the app module) — not visible here.
# Indentation was lost in the paste; this reconstruction assumes the document is
# built once, after the loops, which matches the reported symptom ("works for
# one vulners value, breaks for several").
results = []
for x in collection.find({},{"scan": 1, "_id": 0 }):
    results.append(json.loads(json_util.dumps(x)))
portnumber = []
datay = []
datapro = []
for result in results:
    ips = result['scan']
    for ip in ips:
        ports = result['scan'][ip]['tcp']
        ipdomain = result['scan'][ip]['hostnames']
        for ip2 in ipdomain:
            # NOTE(review): ip3 is overwritten each iteration — only the
            # last hostname survives.
            ip3 = ip2['name']
        for port in ports:
            portnumber.append(port)
            datax = ports[port]['script']
            datay.append(datax)
            datapro2 = ports[port]['product']
            datapro.append(datapro2)
date = datetime.datetime.now()
date_now = date.strftime("%x, %X")
# NOTE(review): built once after all loops, so ports/products/vulners from every
# IP are lumped into a single document — likely the root cause of the problem.
pass_json_var = {'domain': ip3, 'ports': portnumber, 'product': datapro, 'vulnerabilities': datay, "date": date_now}
# NOTE(review): pass_json_var is always a dict here, so the isinstance branch
# for insert_many is dead code.
if isinstance(pass_json_var, list):
    domaindata.insert_many(pass_json_var)
else:
    domaindata.insert_one(pass_json_var)
Ok so here if the "results" output gives me one "vulners" value then it works fine but when it's multiple ports with vulners values it doesn't work!
How can i access the 'vulners' value? Hoping for someone to guide me also a bit, Please try to give a solution which is dynamic
Thanks a lot!
Model based approach
this approach is based on a model of your data you want to parse. From my point of view this is more work in the beginning. With the advantage, that you will have clean error messages and you can control the behaviour by adapting your data model.
make a model of the data you want to parse
from typing import Any, Optional
from pydantic import BaseModel, Field
# Pydantic data model mirroring the shape of the scan JSON; parsing fails
# with a clear error when a required part (e.g. "tcp") is missing.
class ExScript(BaseModel):
    # The "vulners" report text; defaults to "" when absent.
    vulners:str = ""
class Ex30000(BaseModel):
    # The "script" object containing the vulners entry; optional.
    script:ExScript = Field(default=Any)
class ExTcp(BaseModel):
    # Maps the JSON key "30000" (a port number) onto the attribute `root`.
    root:Ex30000= Field(default=Any, alias="30000")
class ExRoot(BaseModel):
    tcp:ExTcp = Field() # Required
class Base(BaseModel):
    # Maps the top-level IP-address key onto `root`.
    root:ExRoot = Field(default=Any, alias="35.0.0.0.0")
change your input data to a raw string, otherwise you will have to escape \n and \t
input_will_work = r"""{
"35.0.0.0.0": {
"hostnames": [
{
"name": "domain.com",
"type": "PTR"
}
],
"addresses": {
"ipv4": "35.0.0.0"
},
"vendor": {},
"status": {
"state": "up",
"reason": "syn-ack"
},
"tcp": {
"30000": {
"state": "open",
"reason": "syn-ack",
"name": "http",
"product": "nginx",
"version": "1.20.0",
"extrainfo": "",
"conf": "10",
"cpe": "cpe:/a:igor_sysoev:nginx:1.20.0",
"script": {
"http-server-header": "nginx/1.20.0",
"vulners": "\n cpe:/a:igor_sysoev:nginx:1.20.0: \n \tNGINX:CVE-2021-23017\t6.8\thttps://vulners.com/nginx/NGINX:CVE-2021-23017\n \t9A14990B-D52A-56B6-966C-6F35C8B8EB9D\t6.8\thttps://vulners.com/githubexploit/9A14990B-D52A-56B6-966C-6F35C8B8EB9D\t*EXPLOIT*\n \t1337DAY-ID-36300\t6.8\thttps://vulners.com/zdt/1337DAY-ID-36300\t*EXPLOIT*\n \tPACKETSTORM:162830\t0.0\thttps://vulners.com/packetstorm/PACKETSTORM:162830\t*EXPLOIT*"
}
}
}
}
}
"""
input_will_fail = r"""{
"35.0.0.0.0": {}
}
"""
3.1 this should give you the expected result
obj1 = Base.parse_raw(input_will_work)
print(obj1.root.tcp.root.script.vulners)
3.2 this should throw an exception
obj2 = Base.parse_raw(input_will_fail)
Search data with jsonpath
should return all objects with the name vulners
from jsonpath_ng import jsonpath, parse
import json
# Find every value stored under a "vulners" key at any depth:
# "$.." is jsonpath's recursive-descent operator.
obj = json.loads(input_will_work)
p = parse('$..vulners')
for match in p.find(obj):
    print(match.value)
Update:
def extract_data(ip_address_data):
    """Summarise one scanned host.

    Returns {"domains": <hostnames list>, "ports_data": [...]} where each
    ports_data entry groups a TCP port with its product and vulners string.
    """
    ports_data = []
    for port, details in ip_address_data["tcp"].items():
        ports_data.append({
            "port": port,
            "product": details["product"],
            "vulners": details['script']['vulners'],
        })
    return {
        "domains": ip_address_data["hostnames"],
        "ports_data": ports_data,
    }
# Result is the data from mongo db
# result = collection.find({})["scan"]
result = {
"127.0.0.1": {
"addresses": {
"ipv4": "127.0.0.1"
},
"hostnames": [
{
"name": "localhost",
"type": "PTR"
}
],
"status": {
"reason": "conn-refused",
"state": "up"
},
"tcp": {
"5000": {
"conf": "10",
"cpe": "cpe:/a:python:python:3.9.2",
"extrainfo": "Python 3.9.2",
"name": "http",
"product": "Werkzeug httpd",
"reason": "syn-ack",
"script": {
"vulners": "\n cpe:/a:python:python:3.9.2: \n \tCVE-2021-29921\t7.5\thttps://vulners.com/cve/CVE-2021-29921\n \tCVE-2021-23336\t4.0\thttps://vulners.com/cve/CVE-2021-23336\n \tMSF:ILITIES/DEBIAN-CVE-2021-3426/\t2.7\thttps://vulners.com/metasploit/MSF:ILITIES/DEBIAN-CVE-2021-3426/\t*EXPLOIT*\n \tCVE-2021-3426\t2.7\thttps://vulners.com/cve/CVE-2021-3426"
},
"state": "open",
"version": "1.0.1"
},
"6000": {
"conf": "10",
"cpe": "cpe:/a:python:python:3.9.2",
"extrainfo": "Python 3.9.2",
"name": "http",
"product": "Werkzeug httpd",
"reason": "syn-ack",
"script": {
"vulners": "\n cpe:/a:python:python:3.9.2: \n \tCVE-2021-29921\t7.5\thttps://vulners.com/cve/CVE-2021-29921\n \tCVE-2021-23336\t4.0\thttps://vulners.com/cve/CVE-2021-23336\n \tMSF:ILITIES/DEBIAN-CVE-2021-3426/\t2.7\thttps://vulners.com/metasploit/MSF:ILITIES/DEBIAN-CVE-2021-3426/\t*EXPLOIT*\n \tCVE-2021-3426\t2.7\thttps://vulners.com/cve/CVE-2021-3426"
},
"state": "open",
"version": "1.0.1"
}
},
"vendor": {
}
}
}
def scandata():
    # Walk the module-level `result` dict (keyed by IP address) and print
    # the extracted summary for each host.
    for ip_address in result:
        ip_address_data = extract_data(
            result[ip_address]
        )
        print(ip_address, ip_address_data)
scandata()
def extract_data(ip_address_data):
    """Summarise one scanned host: hostnames plus one dict per TCP port
    carrying that port's product and vulners string."""
    tcp_entries = ip_address_data["tcp"]
    ports_data = [
        {
            "port": port,
            "product": tcp_entries[port]["product"],
            "vulners": tcp_entries[port]['script']['vulners'],
        }
        for port in tcp_entries.keys()
    ]
    return {
        "domains": ip_address_data["hostnames"],
        "ports_data": ports_data,
    }
#app.route('/api/vulnerableports', methods=['GET'])
def show_vulnerableports():
    # Flask endpoint: fetch every scan document from Mongo, flatten the
    # per-IP data with extract_data(), and return it as JSON.
    # NOTE(review): `collection`, `json_util` and `jsonify` come from the
    # surrounding app module (pymongo/bson/Flask) — not visible here.
    data = []
    resultz = []
    for x in collection.find({}, {"scan": 1, "_id": 0}):
        # Round-trip through json_util to turn BSON types into plain dicts.
        resultz.append(json.loads(json_util.dumps(x)))
    for resultx in resultz:
        result = resultx['scan']
        for ip_address in result:
            ip_address_data = extract_data(
                result[ip_address]
            )
            data.append({ip_address: ip_address_data})
    return jsonify(data)
So this is the solution! i had to iterate over script and then access vulner!

JSON dump appending random characters to end of file

I am writing a parser that goes through a list of data that is roughly formatted:
{
"teachers": [
{
"fullName": "Testing",
"class": [
{
"className": "Counselor",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
}
]
},
...
}
The parser is supposed to check for duplicate names within this json object, and when it stumbles upon said duplicate name, append the class to the class array.
So for example:
{
"teachers": [
{
"fullName": "Testing",
"class": [
{
"className": "Counselor",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
}
]
},
{
"fullName": "Testing",
"class": [
{
"className": "Math 8",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
}
]
},
...
}
Would return
{
"teachers": [
{
"fullName": "Testing",
"class": [
{
"className": "Counselor",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
},
{
"className": "Math 8",
"school": {
"id": "2b6671cb-617d-48d6-b0b5-3d44ce4da21c"
}
},
]
},
...
}
My current parser works just fine for most objects, however for some reason it doesn't catch some of the duplicates despite the names being the exact same, and also is appending the string
}7d-48d6-b0b5-3d44ce4da21c"
}
}
]
}
]
to the end of the json document. I am not sure why it would do this considering I am just dumping the modified json (which only is modified within the array).
My parser code is:
i_duplicates = []     # kept for backward compatibility with older callers
name_duplicates = []  # (no longer used by converter itself)

def converter():
    """Merge teachers with the same fullName in final2.json.

    Duplicates get their "class" lists concatenated onto the first
    occurrence, then the file is rewritten once.

    Two fixes over the original:
    * file.truncate() after json.dump — the merged JSON is shorter than
      the original, and rewriting from seek(0) without truncating leaves
      the tail of the old content behind (the "random characters" at the
      end of the file).
    * no .pop(i) while enumerating the list — removing items mid-iteration
      skipped elements, which is why some duplicates were missed.
    """
    with open("final2.json", "r+") as file:
        data = json.load(file)
        merged = []
        index_by_name = {}  # fullName -> position in `merged`
        for teacher in data["teachers"]:
            name = teacher["fullName"]
            if name in index_by_name:
                # Append the teacher's real classes rather than rebuilding
                # them with a hard-coded school id.
                merged[index_by_name[name]]["class"].extend(teacher["class"])
            else:
                index_by_name[name] = len(merged)
                merged.append(teacher)
        data["teachers"] = merged
        file.seek(0)
        json.dump(data, file, indent=4)
        file.truncate()  # discard leftover bytes from the longer original
def search(a, t):
    """Return the stored "index" of the first record in `a` whose
    "fullName" equals `t`; print a warning and return None if absent."""
    for record in a:
        if record["fullName"] == t:
            return record["index"]
    print(Fore.RED + "not found" + Fore.RESET)
I know I am going about this inefficiently, but I am not sure how to fix the issues the current algorithm is having. Any feedback appreciated.

Simplify MongoEngine QuerySet Results

The Problem
I have an API that returns mongoengine results in JSON format, but the frontend (outside of my scope) is unhappy with the format, namely that values like {"$oid": "60e4a07097ddf1e20fbaaaa0"} exist.
Here's an example response from hitting a GET endpoint:
[
{
"name": "Double R Diner",
"permissions": [
{
"user": {
"$oid": "60de35e078ba4f6f3f41e519"
},
"role": {
"name": "admin",
"description": "account creator"
}
},
{
"user": {
"$oid": "60de35e078ba4f6f3f41e516"
},
"role": {
"name": "admin",
"description": "Double R Diner admin"
}
}
],
"buildings": [
{
"$oid": "60de35e078ba4f6f3f41e51e"
}
],
"address": {
"country": "United States",
"province_state": "WA",
"city": "Twin Peaks"
},
"main_contact": {
"$oid": "60de35e078ba4f6f3f41e519"
},
"_id": {
"$oid": "60de35e078ba4f6f3f41e520"
}
}
]
The nesting of $oid fields is where the problem lies, as the frontend considers this an over-complication that locks us into this weird mongoengine format standard. This nesting should be removed, with the $oid string values replacing their nesting dict where possible. As you can see, nesting can occur within dicts (main_contact) and/or lists (buildings).
The desired outcome looks like this:
[
{
"name": "Double R Diner",
"permissions": [
{
"user": "60de35e078ba4f6f3f41e519",
"role": {
"name": "admin",
"description": "account creator"
}
},
{
"user": "60de35e078ba4f6f3f41e516",
"role": {
"name": "admin",
"description": "Double R Diner admin"
}
}
],
"buildings": [
"60de35e078ba4f6f3f41e51e"
],
"address": {
"country": "United States",
"province_state": "WA",
"city": "Twin Peaks"
},
"main_contact": "60de35e078ba4f6f3f41e519",
"_id": "60de35e078ba4f6f3f41e520"
}
]
I think this is a recursion problem, and recursion breaks my brain.
The Question
How can I best achieve the above transformation; simplifying the response data structure and allowing the front end to be unaware of mongoengine $oid dicts regardless of how deeply nested these ObjectIds are?
I'm sure there is a much better way to do this, but I seem to have solved this problem with the following implementation:
def simplify_ids(obj):
    """Recursively collapse every {"$oid": "..."} wrapper to its bare id
    string, descending through nested dicts and lists (mutating them in
    place).  Non-container values are returned unchanged.
    """
    # isinstance instead of `type(obj) == ...`: also accepts subclasses
    # and is the idiomatic type check.
    if not isinstance(obj, (list, dict)):
        return obj
    if isinstance(obj, dict):
        # A dict whose only key is "$oid" is a mongoengine ObjectId
        # wrapper: replace the whole dict with the plain id string.
        if len(obj) == 1 and '$oid' in obj:
            return obj['$oid']
        for key, value in obj.items():
            obj[key] = simplify_ids(value)
    else:
        for index, nested_obj in enumerate(obj):
            obj[index] = simplify_ids(nested_obj)
    return obj
def clean_query_results(query_results):
    """Serialise a mongoengine QuerySet to a JSON string with every $oid
    wrapper flattened to its plain id string.

    Raises TypeError for anything that is not a QuerySet.
    """
    if type(query_results) != QuerySet:
        raise TypeError(f'Input must be a mongoengine QuerySet, not {type(query_results)}')
    loaded = json.loads(query_results.to_json())  # list of plain dicts
    cleaned = [simplify_ids(obj) for obj in loaded]
    return json.dumps(cleaned)
The calling endpoint needs only to change the Response line from:
return Response(query_results.to_json(), mimetype='application/json', status=200)
to:
return Response(clean_query_results(query_results), mimetype='application/json', status=200)

duplicates in a JSON file based on two attributes

I have a JSON file and that is a nested JSON. I would like to remove duplicates based on two keys.
JSON example:
"books": [
{
"id": "1",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
},
{
"name": "Peter",
"main": 0
}
]
}
]
},
{
"id": "2",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "Jeroge",
"main": 1
},
{
"name": "Peter",
"main": 0
},
{
"name": "John",
"main": 0
}
]
}
]
},
{
"id": "3",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
}
]
}
]
}
]
Here I try to match the title and the author names. For example, id 1 and id 2 are duplicates (the title is the same and the author names are also the same; the author order doesn't matter, and the main attribute need not be considered). So, in the output JSON only id:1 or id:2 will remain, together with id:3. In the final output I need two files.
Output_JSON:
"books": [
{
"id": "1",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
},
{
"name": "Peter",
"main": 0
}
]
}
]
},
{
"id": "3",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
}
]
}
]
}
]
duplicatedID.csv:
1-2
The following method I tried but it is not giving correct results:
# NOTE(review): this is the asker's non-working attempt, annotated with the
# problems found.  Indentation was lost in the paste; the reconstruction
# below is the most plausible reading.
list= []            # NOTE(review): shadows the builtin `list`
duplicate_Id = []
for data in (json_data['books'])[:]:
    elements= []
    id = data['id']  # NOTE(review): shadows the builtin `id`
    title = data['story']['title']
    elements.append(title)
    for i in (data['description'][0]['author']):
        name = (i['name'])
        elements.append(name)
    if not list:
        # NOTE(review): only the very first book's key is ever stored, so
        # later non-duplicate books are never added and can't be matched.
        list.append(elements)
    else:
        for j in list:
            # NOTE(review): set() collapses repeated names and mixes the
            # title in with author names, so different books can compare
            # equal and true duplicates can be missed.
            if set(elements) == set(j):
                duplicate_Id.append(id)
    elements = []
The general idea is to:
Get the groups identified by some function that collects duplicates.
Then return the first entry of each group, ensuring no duplicates.
Define the key function as the sorted list of author names: the set of authors is by definition the unique key, but the authors may appear in any order.
import json
from itertools import groupby
j = json.load(books)
def transform(books):
    """Return one representative book per distinct author set.

    itertools.groupby only merges *consecutive* items with equal keys, so
    the input must be sorted by the same key first — otherwise duplicates
    that are not adjacent in the list survive.
    """
    ordered = sorted(books, key=getAuthors)
    groups = [list(group) for _, group in groupby(ordered, key=getAuthors)]
    return [group[0] for group in groups]

def getAuthors(book):
    """Order-insensitive dedup key: the sorted list of author names."""
    authors = book['description'][0]['author']
    return sorted(author['name'] for author in authors)
print(transform(j['books']))
If we wanted to get the duplicates, then we do the same computation, but return any sublist with length > 1 as this is by our definition duplicated data.
def transform(books):
    """Return only the groups of duplicated books (length > 1 per key).

    Sorted first because itertools.groupby only merges consecutive items
    with equal keys; without the sort, non-adjacent duplicates are missed.
    """
    ordered = sorted(books, key=getAuthors)
    groups = [list(group) for _, group in groupby(ordered, key=getAuthors)]
    return [group for group in groups if len(group) > 1]
Where j['books'] is the JSON you gave enclosed in an object.

Categories

Resources