Python: manipulating JSON, lists, and dictionaries

Sorry for the length, but I tried to be complete.
I'm working with the following data
(only a small sampling from a much larger JSON file, same structure):
{
    "count": 394,
    "status": "ok",
    "data": [
        {
            "md5": "cd042ba78d0810d86755136609793d6d",
            "threatscore": 90,
            "threatlevel": 0,
            "avdetect": 0,
            "vxfamily": "",
            "domains": [
                "dynamicflakesdemo.com",
                "www.bountifulbreast.co.uk"
            ],
            "hosts": [
                "66.33.214.180",
                "64.130.23.5"
            ],
            "environmentId": "1"
        },
        {
            "md5": "4f3a560c8deba19c5efd48e9b6826adb",
            "threatscore": 65,
            "threatlevel": 0,
            "avdetect": 0,
            "vxfamily": "",
            "domains": [
                "px.adhigh.net"
            ],
            "hosts": [
                "130.211.155.133",
                "65.52.108.163",
                "172.225.246.16"
            ],
            "environmentId": "1"
        }
    ]
}
if "threatscore" is over 70 I want to add it to this json structure -
Ex.
"data": [
{
"md5": "cd042ba78d0810d86755136609793d6d",
"threatscore": 90,
{
"Event":
{"date":"2015-11-25",
"threat_level_id":"1",
"info":"HybridAnalysis",
"analysis":"0",
"distribution":"0",
"orgc":"SOC",
"Attribute": [
{"type":"ip-dst",
"category":"Network activity",
"to_ids":True,
"distribution":"3",
"value":"66.33.214.180"},
{"type":"ip-dst",
"category":"Network activity",
"to_ids":True,
"distribution":"3",
"value":"64.130.23.5"}
{"type":"domain",
"category":"Network activity",
"to_ids":True,
"distribution":"3",
"value":"dynamicflakesdemo.com"},
{"type":"domain",
"category":"Network activity",
"to_ids":True,
"distribution":"3",
"value":"www.bountifulbreast.co.uk"}
{"type":"md5",
"category":"Payload delivery",
"to_ids":True,
"distribution":"3",
"value":"cd042ba78d0810d86755136609793d6d"}]
}
}
This is my code -
from datetime import datetime
import os
import json
from pprint import pprint

now = datetime.now()
testFile = open("feed.json")
feed = json.load(testFile)

for x in feed['data']:
    if x['threatscore'] > 90:
        data = {}
        data['Event'] = {}
        data['Event']["date"] = now.strftime("%Y-%m-%d")
        data['Event']["threat_level_id"] = "1"
        data['Event']["info"] = "HybridAnalysis"
        data['Event']["analysis"] = 0
        data['Event']["distribution"] = 3
        data['Event']["orgc"] = "Malware"
        data['Event']["Attribute"] = []
        if 'hosts' in x:
            data['Event']["Attribute"].append({'type': "ip-dst"})
            data['Event']["Attribute"][0]["category"] = "Network activity"
            data['Event']["Attribute"][0]["to-ids"] = True
            data['Event']["Attribute"][0]["distribution"] = "3"
            data["Event"]["Attribute"][0]["value"] = x['hosts']
        if 'md5' in x:
            data['Event']["Attribute"].append({'type': "md5"})
            data['Event']["Attribute"][1]["category"] = "Payload delivery"
            data['Event']["Attribute"][1]["to-ids"] = True
            data['Event']["Attribute"][1]["distribution"] = "3"
            data['Event']["Attribute"][1]['value'] = x['md5']
        if 'domains' in x:
            data['Event']["Attribute"].append({'type': "domain"})
            data['Event']["Attribute"][2]["category"] = "Network activity"
            data['Event']["Attribute"][2]["to-ids"] = True
            data['Event']["Attribute"][2]["distribution"] = "3"
            data['Event']["Attribute"][2]["value"] = x['domains']
        attributes = data["Event"]["Attribute"]
        data["Event"]["Attribute"] = []
        for attribute in attributes:
            for value in attribute["value"]:
                if value == " ":
                    pass
                else:
                    new_attr = attribute.copy()
                    new_attr["value"] = value
                    data["Event"]["Attribute"].append(new_attr)
        pprint(data)

with open('output.txt', 'w') as outfile:
    json.dump(data, outfile)
And now it seems to be cleaned up a little, but data['md5'] is being split on each letter. I think it's just like L3viathan said earlier: I keep overwriting the first element in the dictionary... but I'm not sure how to get it to keep appending?
{'Event': {'Attribute': [{'category': 'Network activity',
                          'distribution': '3',
                          'to-ids': True,
                          'type': 'ip-dst',
                          'value': u'216.115.96.174'},
                         {'category': 'Network activity',
                          'distribution': '3',
                          'to-ids': True,
                          'type': 'ip-dst',
                          'value': u'64.4.54.167'},
                         {'category': 'Network activity',
                          'distribution': '3',
                          'to-ids': True,
                          'type': 'ip-dst',
                          'value': u'63.250.200.37'},
                         {'category': 'Payload delivery',
                          'distribution': '3',
                          'to-ids': True,
                          'type': 'md5',
                          'value': u'7'},
                         {'category': 'Payload delivery',
                          'distribution': '3',
                          'to-ids': True,
                          'type': 'md5',
                          'value': u'1'},
And I'm still getting the following error at the end:
Traceback (most recent call last):
  File "hybridanalysis.py", line 34, in <module>
    data['Event']["Attribute"][1]["category"] = "Payload delivery"
IndexError: list index out of range
The final goal is to get it set up so that I can post the events into MISP, but they have to go in one at a time.

I think this should fix your problems. I added each attribute dictionary all in one go, and moved the data into a list (which is more appropriate), but you might want to remove the superfluous list that wraps the Events.
from datetime import datetime
import os
import json
from pprint import pprint

now = datetime.now()
testFile = open("feed.json")
feed = json.load(testFile)

data_list = []
for x in feed['data']:
    if x['threatscore'] > 90:
        data = {}
        data['Event'] = {}
        data['Event']["date"] = now.strftime("%Y-%m-%d")
        data['Event']["threat_level_id"] = "1"
        data['Event']["info"] = "HybridAnalysis"
        data['Event']["analysis"] = 0
        data['Event']["distribution"] = 3
        data['Event']["orgc"] = "Malware"
        data['Event']["Attribute"] = []
        if 'hosts' in x:
            data['Event']["Attribute"].append({
                'type': 'ip-dst',
                'category': 'Network activity',
                'to-ids': True,
                'distribution': '3',
                'value': x['hosts']})
        if 'md5' in x:
            data['Event']["Attribute"].append({
                'type': 'md5',
                'category': 'Payload delivery',
                'to-ids': True,
                'distribution': '3',
                'value': x['md5']})
        if 'domains' in x:
            data['Event']["Attribute"].append({
                'type': 'domain',
                'category': 'Network activity',
                'to-ids': True,
                'distribution': '3',
                'value': x['domains']})
        attributes = data["Event"]["Attribute"]
        data["Event"]["Attribute"] = []
        for attribute in attributes:
            for value in attribute["value"]:
                if value == " ":
                    pass
                else:
                    new_attr = attribute.copy()
                    new_attr["value"] = value
                    data["Event"]["Attribute"].append(new_attr)
        data_list.append(data)

with open('output.txt', 'w') as outfile:
    json.dump(data_list, outfile)
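Since the stated end goal is to post each event into MISP one at a time, a minimal follow-up sketch could loop over data_list and POST the events individually. The URL, endpoint path, and API key below are placeholders (my assumptions), not confirmed MISP API details:

import json
import requests  # assumption: the requests library is installed

MISP_URL = "https://misp.example.com/events"  # hypothetical MISP endpoint
HEADERS = {
    "Authorization": "YOUR_API_KEY",          # hypothetical auth key
    "Accept": "application/json",
    "Content-Type": "application/json",
}

for event in data_list:
    # one request per event, since they have to go in one at a time
    response = requests.post(MISP_URL, headers=HEADERS, data=json.dumps(event))
    response.raise_for_status()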

In the JSON, "Attribute" holds a list with one item (a dict) in it, as shown here:
{'Event': {'Attribute': [{'category': 'Network activity',
                          'distribution': '3',
                          'to-ids': True,
                          'type': 'ip-dst',
                          'value': [u'54.94.221.70']}]
...
When you call data['Event']["Attribute"][1]["category"], you are asking for the second item (index 1) in the Attribute list, while it only has one item, which is why you are getting the error.
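To make the failure concrete, here is a minimal sketch of the same mistake:

attrs = [{'type': 'ip-dst'}]
attrs[0]['category'] = 'Network activity'    # fine: index 0 exists
# attrs[1]['category'] = 'Payload delivery'  # IndexError: list index out of range
new_attr = {'type': 'md5', 'category': 'Payload delivery'}
attrs.append(new_attr)                       # building the dict first, then appending, avoids the bad index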

Thanks L3viathan! Below is how I tweaked it so it doesn't iterate over MD5s.
attributes = data["Event"]["Attribute"]
data["Event"]["Attribute"] = []
for attribute in attributes:
    if attribute['type'] == 'md5':
        new_attr = attribute.copy()
        new_attr["value"] = str(x['md5'])
        data["Event"]["Attribute"].append(new_attr)
    else:
        for value in attribute["value"]:
            new_attr = attribute.copy()
            new_attr["value"] = value
            data["Event"]["Attribute"].append(new_attr)
data_list.append(data)
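For reference, the earlier character-splitting happened because iterating over a string yields one character at a time, while iterating over a one-element list yields the whole string:

>>> [v for v in "cd042b"]
['c', 'd', '0', '4', '2', 'b']
>>> [v for v in ["cd042b"]]
['cd042b']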
Manipulating JSON seems to be a good way to learn lists and dictionaries.

Related

Why does "sort" appear in this JSON of elasticsearch?

I am indexing data into Elasticsearch.
I do not know what "sort" is. I have not put it in the mapping, and it is not in the data that I am indexing.
Why does it appear?
This is my code:
def initialize_mapping(es):
    mapping_classification = {
        'properties': {
            '#timestamp': {'type': 'date'},
            'Labels': {'type': 'keyword'},
            'Model': {'type': 'keyword'},
            'Image': {'type': 'keyword'},
            'Time(ms)': {'type': 'short'},
            'Inference': {'type': 'text'},
            'Score': {'type': 'short'},
            'TPU_temp(°C)': {'type': 'short'}
        }
    }
    print("Initializing the mapping ...")
    if not es.indices.exists(INDEX_NAME):
        es.indices.create(INDEX_NAME)
        es.indices.put_mapping(body=mapping_classification, doc_type=DOC_TYPE, index=INDEX_NAME)

def main():
    es = initialize_elasticsearch()
    initialize_mapping(es)
    actions = [
        {
            '_index': INDEX_NAME,
            '_type': DOC_TYPE,
            "#timestamp": str(datetime.datetime.utcnow().strftime("%Y-%m-%d"'T'"%H:%M:%S")),
            "Labels": maX_group[0].split(":")[1],
            "Model": maX_group[1].split(":")[1],
            "Image": maX_group[2].split(":")[1],
            "Time(ms)": maX_group[4].split(":")[1],
            "Inference": maX_group[5].split(":")[1],
            "Score": maX_group[6].split(":")[1],
            "TPU_temp(°C)": maX_group[7].split(":")[1]
        }]
    try:
        res = helpers.bulk(client=es, index=INDEX_NAME, actions=actions)
        print("\nhelpers.bulk() RESPONSE:", res)
        print("RESPONSE TYPE:", type(res))
    except Exception as err:
        print("\nhelpers.bulk() ERROR:", err)

if __name__ == "__main__":
    main()
That sort value is not in your document at all. Only what you see in _source is actually your document.
In your other question, you might have created an index pattern without specifying any #timestamp field; hence the documents were not sorted in the Discover view and you didn't see any sort value.
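For illustration, a hit from a sorted search looks roughly like this (index name and values here are made up); the sort array is metadata Elasticsearch attaches next to _source, holding the sort key it used for that hit:

{
    "_index": "my-index",
    "_type": "_doc",
    "_id": "a1b2c3",
    "_source": {
        "#timestamp": "2020-05-01T12:00:00",
        "Labels": "cat"
    },
    "sort": [1588334400000]
}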

How to iterate over a JSON array and get values for a key which itself is a JSON object

I have been trying to do something that seems simple, yet has been hard for me to solve!
I have a JSON object that looks like this:
jsonObject = {
    'attributes': {
        '192': {  # <-- this key can change from time to time (a different number)
            'id': '192',
            'code': 'hello',
            'label': 'world',
            'options': [
                {
                    'id': '211',
                    'label': '5'
                },
                {
                    'id': '1202',
                    'label': '8.5'
                },
                {
                    'id': '54',
                    'label': '9'
                },
                {
                    'id': '1203',
                    'label': '9.5'
                },
                {
                    'id': '58',
                    'label': '10'
                }
            ]
        }
    },
    'template': '12345',
    'basePrice': '51233',
    'oldPrice': '51212',
    'productId': 'hello',
}
and what I want to do is get the values from options (to have both id and label saved into a list).
For now I have only managed to do:
for att, value in jsonObject.items():
    print(f"{att} - {value}")
How can I get the label and id?
You can try the following code:
attr = jsonObject['attributes']
temp = list(attr.values())[0]  # same as "temp = attr['192']", but you said '192' can change
options = temp['options']
for option in options:
    print(f"id: {option['id']}, label: {option['label']}")

Replacement for dataframe.iterrows()

I'm working on a script for migrating data from MongoDB to ClickHouse. Because nested structures aren't implemented well enough in ClickHouse, I iterate over the nested structure and bring it to a flat representation, where every element of the nested structure is a distinct row in the ClickHouse database.
What I do is iterate over a list of dictionaries and take the target values. The structure looks like this:
[
    {
        'Comment': None,
        'Details': None,
        'FunnelId': 'MegafonCompany',
        'IsHot': False,
        'IsReadonly': False,
        'Name': 'Новый',
        'SetAt': datetime.datetime(2018, 4, 20, 10, 39, 55, 475000),
        'SetById': 'ekaterina.karpenko',
        'SetByName': 'Екатерина Карпенко',
        'Stage': {
            'Label': 'Новые',
            'Order': 0,
            '_id': 'newStage'
        },
        'Tags': None,
        'Type': 'Unknown',
        'Weight': 120,
        '_id': 'new'
    },
    {
        'Comment': None,
        'Details': {
            'Name': 'взят в работу',
            '_id': 1
        },
        'FunnelId': 'MegafonCompany',
        'IsHot': False,
        'IsReadonly': False,
        'Name': 'В работе',
        'SetAt': datetime.datetime(2018, 4, 20, 10, 40, 4, 841000),
        'SetById': 'ekaterina.karpenko',
        'SetByName': 'Екатерина Карпенко',
        'Stage': {
            'Label': 'Приглашение на интервью',
            'Order': 1,
            '_id': 'recruiterStage'
        },
        'Tags': None,
        'Type': 'InProgress',
        'Weight': 80,
        '_id': 'phoneInterview'
    }
]
I have a function that does this on a dataframe object via the data.iterrows() method:
def to_flat(data, coldict, field_last_upd):
    m_status_history = stc.special_mongo_names['status_history_cand']
    n_statuse_change = coldict['n_statuse_change']['name']
    data[n_statuse_change] = n_status_change(dp.force_take_series(data, m_status_history))
    flat_cols = [x for x in coldict.values() if x['coltype'] == stc.COLTYPE_FLAT]
    old_cols_names = [x['name'] for x in coldict.values() if x['coltype'] == stc.COLTYPE_PREPARATION]
    t_time = time.time()
    t_len = 0
    new_rows = list()
    for i, row in data.iterrows():  # the row-wise iteration the question describes
        for j in range(row[n_statuse_change]):
            t_new_value_row = np.empty(shape=[0, 0])
            for k in range(len(flat_cols)):
                if flat_cols[k]['colsubtype'] == stc.COLSUBTYPE_FLATPATH:
                    new_value = dp.under_value_line(
                        row,
                        path_for_status(j, row[n_statuse_change]-1, flat_cols[k]['path'])
                    )
                    # additionally process the date
                    if flat_cols[k]['name'] == coldict['status_set_at']['name']:
                        new_value = dp.iso_date_to_datetime(new_value)
                    if flat_cols[k]['name'] == coldict['status_set_at_mil']['name']:
                        new_value = dp.iso_date_to_miliseconds(new_value)
                    if flat_cols[k]['name'] == coldict['status_stage_order']['name']:
                        try:
                            new_value = int(new_value)
                        except:
                            new_value = new_value
                else:
                    if flat_cols[k]['name'] == coldict['status_index']['name']:
                        new_value = j
                t_new_value_row = np.append(t_new_value_row, dp.some_to_null(new_value))
            new_rows.append(np.append(row[old_cols_names].values, t_new_value_row))
    pdb.set_trace()
    res = pd.DataFrame(new_rows, columns=[
        x['name'] for x in coldict.values()
        if x['coltype'] == stc.COLTYPE_FLAT or x['coltype'] == stc.COLTYPE_PREPARATION
    ])
    return res
It takes values from the list of dicts, prepares them to meet ClickHouse's requirements using numpy arrays, and then appends them all together to get a new dataframe with the target values and column names.
I've noticed that if the nested structure is big enough, it starts to work much more slowly. I've found an article where different methods of iteration in Python are compared.
It claims that iterating via the .apply() method is much faster, and that vectorization is faster still. But the samples given are pretty trivial and rely on applying the same function to all of the values. Is it possible to iterate over a pandas object in a faster manner while using a variety of functions on different types of data?
I think your first step should be converting your data into a pandas dataframe; then it will be much easier to handle. I couldn't decipher the exact functions you wanted to run, but perhaps my example helps.
import datetime
import pandas as pd

data_dict_array = [
    {
        'Comment': None,
        'Details': None,
        'FunnelId': 'MegafonCompany',
        'IsHot': False,
        'IsReadonly': False,
        'Name': 'Новый',
        'SetAt': datetime.datetime(2018, 4, 20, 10, 39, 55, 475000),
        'SetById': 'ekaterina.karpenko',
        'SetByName': 'Екатерина Карпенко',
        'Stage': {
            'Label': 'Новые',
            'Order': 0,
            '_id': 'newStage'
        },
        'Tags': None,
        'Type': 'Unknown',
        'Weight': 120,
        '_id': 'new'
    },
    {
        'Comment': None,
        'Details': {
            'Name': 'взят в работу',
            '_id': 1
        },
        'FunnelId': 'MegafonCompany',
        'IsHot': False,
        'IsReadonly': False,
        'Name': 'В работе',
        'SetAt': datetime.datetime(2018, 4, 20, 10, 40, 4, 841000),
        'SetById': 'ekaterina.karpenko',
        'SetByName': 'Екатерина Карпенко',
        'Stage': {
            'Label': 'Приглашение на интервью',
            'Order': 1,
            '_id': 'recruiterStage'
        },
        'Tags': None,
        'Type': 'InProgress',
        'Weight': 80,
        '_id': 'phoneInterview'
    }
]

# converting your data into something pandas can read
# in particular, flattening the Stage dict
for data_dict in data_dict_array:
    d_temp = data_dict.pop("Stage")
    data_dict["Stage_Label"] = d_temp["Label"]
    data_dict["Stage_Order"] = d_temp["Order"]
    data_dict["Stage_id"] = d_temp["_id"]

df = pd.DataFrame(data_dict_array)

# let's say I want to set Comment to "cool" if Name is 'В работе'
# in .loc[], the first argument filters the rows, the second picks the column
df.loc[df['Name'] == 'В работе', 'Comment'] = "cool"
df
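As a side note, if the nested dicts are regular, pandas can do this flattening for you. A sketch, assuming pandas 1.0+ where json_normalize is exposed as pd.json_normalize:

import pandas as pd

# expands nested dicts automatically: 'Stage' becomes Stage_Label, Stage_Order, Stage__id
df = pd.json_normalize(data_dict_array, sep='_')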

Create 2 records from JSON Array having Structs

I have a JSON array in the format below:
{
    "id": "1",
    "active": "True",
    "gender": "female",
    "coding": [
        {
            "system": "http://loinc.org",
            "code": "8310-5",
            "display": "Body temperature"
        },
        {
            "system": "http://loinc.org",
            "code": "8716-3",
            "display": "Vital Signs grouping"
        }
    ]
}
I need the output as two records, as shown below. Is it possible? Can someone help me with the Python code?
{"id": "1","active": "True","gender": "female",{"system": "http://loinc.org","code": "8310-5","display": "Body temperature"},
{"id": "1","active": "True","gender": "female",{"system": "http://loinc.org","code": "8716-3","display": "Vital Signs grouping"}
I'm going to assume you want the codings in their own key, since your question wasn't clear:
import json

obj = json.loads(s)  # where s is your json string
objs = []            # where we will store the results
for coding in obj['coding']:
    new_obj = obj.copy()
    new_obj['coding'] = coding  # set the coding entry to one coding
    objs.append(new_obj)
Output of objs:
[{'active': 'True',
  'coding': {'code': '8310-5',
             'display': 'Body temperature',
             'system': 'http://loinc.org'},
  'gender': 'female',
  'id': '1'},
 {'active': 'True',
  'coding': {'code': '8716-3',
             'display': 'Vital Signs grouping',
             'system': 'http://loinc.org'},
  'gender': 'female',
  'id': '1'}]
If you want just a flat dict, then:
objs = []
for coding in obj['coding']:
    new_obj = obj.copy()
    del new_obj['coding']
    new_obj.update(coding)
    objs.append(new_obj)
Now objs is:
[{'active': 'True',
  'code': '8310-5',
  'display': 'Body temperature',
  'gender': 'female',
  'id': '1',
  'system': 'http://loinc.org'},
 {'active': 'True',
  'code': '8716-3',
  'display': 'Vital Signs grouping',
  'gender': 'female',
  'id': '1',
  'system': 'http://loinc.org'}]
You can do it like this:
import json

input_dict = json.loads(myjson)
base = input_dict.copy()
base.pop('coding')
output = [dict(base, **c) for c in input_dict['coding']]
print(output)
Output:
[{'active': 'True', 'code': '8310-5', 'display': 'Body temperature', 'gender': 'female', 'id': '1', 'system': 'http://loinc.org'},
{'active': 'True', 'code': '8716-3', 'display': 'Vital Signs grouping', 'gender': 'female', 'id': '1', 'system': 'http://loinc.org'}]
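The dict(base, **c) idiom copies base into a new dict and overlays c on top, with c's values winning on duplicate keys; a tiny illustration:

base = {'a': 1, 'b': 2}
extra = {'b': 99, 'c': 3}
print(dict(base, **extra))  # {'a': 1, 'b': 99, 'c': 3}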

Building a dictionary from directory structure

I'm trying to build a dictionary that looks like this:
nodes = {
    'var': {
        'type': 'd',
        'full_path': '/var',
        'active': True,
        'www': {
            'type': 'd',
            'full_path': '/var/www',
            'active': True,
            'index.html': {
                'type': 'f',
                'full_path': '/var/www/index.html',
                'active': False
            }
        },
        'log': {
            'type': 'd',
            'full_path': '/var/log',
            'active': False
        }
    },
    'srv': {
        'type': 'd',
        'full_path': '/srv',
        'active': True
    }
}
I need it to be built from two pieces. The first needs to come from the file system, where everything is 'active'. The second needs to come from a listing of full paths of files, where everything is inactive.
So...
nodes = {}
for f, d, r in os.walk(root_path):
    # append active items to nodes
for f in os.system(command_that_gets_files):
    # append inactive items to nodes; not overwriting active
I'm sure I'm missing details...
Here's one way to get the active files. I found it easier to recurse than to use os.walk()'s iterative data. You may uncomment the result['stat'] line if you need to preserve more information than file type.
Every file has a dict entry like:
filename : { 'active': True,
             'full_path': '/path/to/filename',
             'type': 'f' }
Every directory has a dict entry like:
dirname : { 'active': True,
            'full_path': '/path/to/dirname',
            'type': 'd',
            'items': { 'itemname': {...}, ... } }
Here you go:
import sys
import os
from stat import *
import pprint

def PathToDict(path):
    st = os.stat(path)
    result = {}
    result['active'] = True
    #result['stat'] = st
    result['full_path'] = path
    if S_ISDIR(st.st_mode):
        result['type'] = 'd'
        result['items'] = {
            name: PathToDict(path + '/' + name)
            for name in os.listdir(path)}
    else:
        result['type'] = 'f'
    return result

pprint.pprint(PathToDict(sys.argv[1]))
Result:
{'active': True,
 'full_path': '/tmp/x',
 'items': {'var': {'active': True,
                   'full_path': '/tmp/x/var',
                   'items': {'log': {'active': True,
                                     'full_path': '/tmp/x/var/log',
                                     'items': {},
                                     'type': 'd'},
                             'www': {'active': True,
                                     'full_path': '/tmp/x/var/www',
                                     'items': {'index.html': {'active': True,
                                                              'full_path': '/tmp/x/var/www/index.html',
                                                              'type': 'f'}},
                                     'type': 'd'}},
                   'type': 'd'}},
 'type': 'd'}
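For the second piece of the question (the inactive paths), here is a rough sketch of how a list of full paths could be merged into the same tree without overwriting active entries. The helper name and the assumption that we merge into an 'items'-style name-to-entry dict are mine, not part of the answer above:

def add_inactive(items, full_paths):
    # items: a name -> entry dict, shaped like the 'items' dicts produced above
    for path in full_paths:
        parts = path.strip('/').split('/')
        current = items
        so_far = ''
        for i, name in enumerate(parts):
            so_far += '/' + name
            is_file = (i == len(parts) - 1)
            if name not in current:
                # new node: mark it inactive; existing (active) nodes are untouched
                current[name] = {'active': False,
                                 'full_path': so_far,
                                 'type': 'f' if is_file else 'd'}
                if not is_file:
                    current[name]['items'] = {}
            current = current[name].get('items', {})

For example, add_inactive(tree['items'], ['/var/www/index.html', '/var/log']) would fill in the missing nodes as inactive while leaving active ones alone.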
