Python - appending json file

I am trying to append the JSON files from a folder into a variable so I can parse it out later. Here is the code I have:
# Importing dependencies
import os
import shutil
import glob
from zipfile import ZipFile
from datetime import datetime
import zipfile
import json
from pandas.io.json import json_normalize
import urllib
import sqlalchemy as sa

# Define the folder sources and destinations
MainDir = 'C:/Test/'
LoadingDir = 'C:/Test/Loading/'
ArchiveDir = 'C:/Test/Archive/'

glob_data = []
# Look for all json files in directory
for file in glob.glob(LoadingDir + '*.json'):
    with open(file) as json_file:
        # Load each json file and append it
        data = json.load(json_file)
        i = 0
        while i < len(data):
            glob_data.append(data[i])
            i += 1

with open(LoadingDir + 'Combined.json', 'w') as f:
    json.dump(glob_data, f, indent=4)

# Load Json file for parsing
file = open(LoadingDir + 'Combined.json')
data = json.load(file)

# Parsing of data
df = json_normalize(data, meta=['timestamp'])
df.to_csv(LoadingDir + "Combined.csv", sep=',', encoding='utf-8')
try:
    df.to_csv(LoadingDir + "Combined.csv", sep=',', encoding='utf-8')
except:
    pass
When I try running it, I get this message below:
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-1-ea0f48aa463e> in <module>
     24         i = 0
     25         while i < len(data):
---> 26             glob_data.append(data[i])
     27             i += 1
     28 with open(LoadingDir + 'Combined.json', 'w') as f:

KeyError: 0
Here is a sample of my JSON file:
{
  "sensor-time" : {
    "timezone" : "America/Los_Angeles",
    "time" : "2019-11-05T14:18:36-08:00"
  },
  "status" : {
    "code" : "OK"
  },
  "content" : {
    "element" : [ {
      "element-id" : 0,
      "element-name" : "Line 0",
      "sensor-type" : "SINGLE_SENSOR",
      "data-type" : "LINE",
      "from" : "2019-11-01T00:00:00-07:00",
      "to" : "2019-11-05T15:00:00-08:00",
      "resolution" : "ONE_HOUR",
      "measurement" : [ {
        "from" : "2019-11-01T00:00:00-07:00",
        "to" : "2019-11-01T01:00:00-07:00",
        "value" : [ {
          "value" : 0,
          "label" : "fw"
        }, {
          "value" : 0,
          "label" : "bw"
        } ]
      }, {
        "from" : "2019-11-01T01:00:00-07:00",
        "to" : "2019-11-01T02:00:00-07:00",
        "value" : [ {
          "value" : 0,
          "label" : "fw"
        }, {
          "value" : 0,
          "label" : "bw"
        } ]
      }, {
        "from" : "2019-11-01T02:00:00-07:00",
        "to" : "2019-11-01T03:00:00-07:00",
        "value" : [ {
          "value" : 0,
          "label" : "fw"
        }, {
          "value" : 0,
          "label" : "bw"
        } ]
      },
What I noticed is that this JSON file does not start with [, which means it is not a list of dictionaries. When I have JSON that does start with [, my code works.
How do I adjust this to work for this sample of JSON?

Change your code to:
import os
import shutil
import glob
from zipfile import ZipFile
from datetime import datetime
import zipfile
import json
from pandas.io.json import json_normalize
import urllib
import sqlalchemy as sa

# Define the folder sources and destinations
MainDir = 'C:/Test/'
LoadingDir = 'C:/Test/Loading/'
ArchiveDir = 'C:/Test/Archive/'

glob_data = []
# Look for all json files in directory
for file in glob.glob(LoadingDir + '*.json'):
    with open(file) as json_file:
        # Load each json file and append it
        data = json.load(json_file)
        glob_data.append(data)

with open(LoadingDir + 'Combined.json', 'w') as f:
    json.dump(glob_data, f, indent=4)

# Load Json file for parsing
file = open(LoadingDir + 'Combined.json')
data = json.load(file)

# Parsing of data
df = json_normalize(data, meta=['timestamp'])
df.to_csv(LoadingDir + "Combined.csv", sep=',', encoding='utf-8')
You don't need to iterate over the value returned by json.load(); it has already been parsed into a dict (or a list, when the file starts with [), so just append it directly.
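
If some of your files contain a top-level array and others a top-level object, a minimal sketch (reusing the LoadingDir layout from the question) could branch on the parsed type:

import glob
import json

LoadingDir = 'C:/Test/Loading/'

glob_data = []
for file in glob.glob(LoadingDir + '*.json'):
    with open(file) as json_file:
        data = json.load(json_file)
        if isinstance(data, list):
            # Top-level array: take each item as a record
            glob_data.extend(data)
        else:
            # Top-level object: append the whole dict as one record
            glob_data.append(data)

with open(LoadingDir + 'Combined.json', 'w') as f:
    json.dump(glob_data, f, indent=4)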

Related

json data into individual csv file

I have many JSON files under /json/reports/, and for each JSON file the output needs to be converted into a CSV file individually.
I have the following Python code to do the conversion.
import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    f = open(file, 'r')
    jsonData = pd.read_json(f.read())
    jsonData.to_csv(f.name + ".csv")
    f.close()
The first few lines of one of my JSON files look as follows.
[
  {
    "api_result": "KEY_NAME",
    "ml_result": "VALUE",
    "line_data_list": [
      {
        "line": "54A2FF607A6dsewroadeEOERD> |-",
        "line_num": 9053,
        "path": "/home/user/src/common/race/flow/prog_flow.mk",
        "value": "WOERJFOQDKSDFKKASDF0",
        "variable": null,
        "entropy_validation": true
      }
    ],
    "ml_part": 0.994396984577179,
    "rule": "GCP Client ID",
    "severity": "high"
  },
  {
    "api_result": "NOT_AVAILABLE",
    "ml_result": "NOT_AVAILABLE",
    "line_data_list": [
      {
        "line": "-----BEGIN Result-----",
        "line_num": 19873,
        "path": "/home/user/test/linux/ops/format.key",
        "value": "-----BEGIN RSA PRIVATE",
        "variable": null,
        "entropy_validation": false
      }
    ],
    "ml_part": null,
    "rule": "Certificate",
    "severity": "low"
  },
  .....
Problem:
The above Python code writes the line_data_list values (line, line_num, path, value, variable, and entropy_validation) into a single column, but I need each value in a separate column, i.e. in the format specified below.
Expected output CSV per JSON file, with the line_data_list fields (line, line_num, path, value, variable, entropy_validation) expanded into their own columns:

Sl.no,api_result,ml_result,line,line_num,path,value,variable,entropy_validation,ml_part,rule,severity
1,KEY_NAME,VALUE,54A2FF607A6dsewroadeEOERD,9053,/home/user98/src/common/race/flow/prog_flow.mk,WOERJFOQDKSDFKKASDFO,null,TRUE,0.994396985,GCP Client ID,high
2,NOT_AVAILABLE,NOT_AVAILABLE,-----BEGIN Result-----,19873,/home/user/test/linux/ops/format.key,-----BEGIN RSA PRIVATE,null,false,null,Certificate,low
3,...
I need help printing each value in a separate column.
Starting from your JSON file:
df = pd.read_json("mydata.json")
t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
pd.concat([df, t], axis=1)
In your case:

import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    df = pd.read_json(file)
    t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
    df = pd.concat([df, t], axis=1)
    df.to_csv(f'{file[:-5]}.csv')
You need to unpack your line_data_list key-value pairs so they occur on the same level as your other columns. Something like what I've written below would work.
import pandas as pd
import glob
import json

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    with open(file, 'r') as f:
        records = json.load(f)
    for record in records:
        # Pop line_data_list and promote its fields to the top level
        line_data = record.pop("line_data_list")
        record.update(line_data[0])
    jsonData = pd.DataFrame.from_records(records)
    jsonData.to_csv(file + ".csv")
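
Alternatively, a sketch using pandas.json_normalize (the record_path/meta field names below are taken from the sample JSON above) can expand the line_data_list rows and carry the parent fields along in one call:

import glob
import json
import pandas as pd

for file in glob.glob('/json/reports/*.json'):
    with open(file) as f:
        records = json.load(f)
    # One output row per line_data_list entry; meta columns repeat the parent fields
    df = pd.json_normalize(
        records,
        record_path='line_data_list',
        meta=['api_result', 'ml_result', 'ml_part', 'rule', 'severity'],
    )
    df.to_csv(file + '.csv', index=False)

Unlike the loop above, this also handles line_data_list entries with more than one element.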

JSONDecodeError: Expecting value: line 2 column 13 (char 15)

I have a nested JSON file which I got from a MongoDB export.
I am trying to convert it to CSV through Python code.
I tried every way I could find to convert it to CSV but couldn't succeed.
I also followed a previous question and its solution, but it didn't work for me.
My json format is
{
  "d1" : ("value1"),
  "d2" : (value2-int),
  "d3" : [
    {
      "sub-d1" : sub-value1(int),
      "sub-d2" : sub-value2(int),
      "sub-d3" : sub-value3(int),
      "sub-d4" : [
        {
          "sub-sub-d1" : "sub-sub-value3",
          "sub-sub-d2" : sub-value3(int)
        },
        {
          "sub-sub-d1" : sub-sub-value3(int),
          "sub-sub-d2" : "sub-sub-value3"
        }
      ],
      "sub-d5" : "sub-value4",
      "sub-d6" : "sub-value5"
    }
  ],
  "d4" : "value3",
  "d5" : "value4",
  "d6" : "value5",
  "d7" : "value6"
}
{ another entry with same pattern.. and so on }
Some of the values and sub-values are integers, and some mix strings and integers.
What I tried
import json
import csv
import requests

with open('./data/inverter.json', 'r') as myfile:
    json_data = myfile.read()

def get_leaves(item, key=None):
    if isinstance(item, dict):
        leaves = {}
        for i in item.keys():
            leaves.update(get_leaves(item[i], i))
        return leaves
    elif isinstance(item, list):
        leaves = {}
        for i in item:
            leaves.update(get_leaves(i, key))
        return leaves
    else:
        return {key : item}

# First parse all entries to get the complete fieldname list
fieldnames = set()
for entry in json_data:
    fieldnames.update(get_leaves(entry).keys())

with open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.DictWriter(f_output, fieldnames=sorted(fieldnames))
    csv_output.writeheader()
    csv_output.writerows(get_leaves(entry) for entry in json_data)
This one saves all my data in a single column with split values.
I also tried to use https://github.com/vinay20045/json-to-csv.git, but that didn't work either.
I also tried to parse it with a simple trick, using the following code:

with open("./data/inverter.json") as data_file:
    data = data_file.read()
    #print(data)
    data_content = json.loads(data)
    print(data_content)
but it throws an error: 'JSONDecodeError: Expecting value: line 2 column 13 (char 15)'.
Can anyone help me convert my nested JSON to CSV?
It would be appreciated. Thank you.
It looks like the NumberInt(234234) issue you describe was a bug in MongoDB: how to export mongodb without any wrapping with NumberInt(...)?
If you cannot fix it by upgrading MongoDB, I can recommend preprocessing the data with regular expressions and parsing it as regular JSON after that.
For the sake of example, let's say you've got "test.json" that looks like this, which is valid except for the NumberInt(...) stuff:
{
  "d1" : "value1",
  "d2" : NumberInt(1234),
  "d3" : [
    {
      "sub-d1" : 123,
      "sub-d2" : 123,
      "sub-d3" : 123,
      "sub-d4" : [
        {
          "sub-sub-d1" : "sub-sub-value3",
          "sub-sub-d2" : NumberInt(123)
        },
        {
          "sub-sub-d1" : 43242,
          "sub-sub-d2" : "sub-sub-value3"
        }
      ]
    }
  ],
  "d4" : "value3",
  "d5" : "value4",
  "d6" : "value5",
  "d7" : "value6"
}
You could import this into Python as follows:
import re
import json

with open("test.json") as f:
    data = f.read()

# This regular expression finds/replaces the NumberInt bits with just the contents
fixed_data = re.sub(r"NumberInt\((\d+)\)", r"\1", data)
loaded_data = json.loads(fixed_data)
print(json.dumps(loaded_data, indent=4))
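
Once the data parses, a short sketch (assuming you want one CSV row per top-level document; the output file name is just an example) could finish the conversion to CSV with pandas:

import pandas as pd

# json_normalize accepts a single dict or a list of them;
# nested dicts become dotted column names like d3.sub-d1
df = pd.json_normalize(loaded_data)
df.to_csv("inverter.csv", index=False)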

How to get an array of first elements from a json array

I have a config.json file, which contains an array of organisations:
config.json
{
  "organisations": [
    { "displayName" : "org1", "bucketName" : "org1_bucket" },
    { "displayName" : "org2", "bucketName" : "org2_bucket" },
    { "displayName" : "org3", "bucketName" : "org3_bucket" }
  ]
}
How can I get an array of all organisation names?
This is what I have tried:
from python_json_config import ConfigBuilder

def read_config():
    builder = ConfigBuilder()
    org_array = builder.parse_config('config.json')
    # return all firstNames in org_array
import json

def read_config():
    display_names = []
    with open('yourfilename.json', 'r', encoding="utf-8") as file:
        orgs = json.load(file)
        display_names = [o["displayName"] for o in orgs["organisations"]]
    return display_names
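
Called like this (assuming the file passed to open is the config.json shown above), it returns the names:

names = read_config()
print(names)  # ['org1', 'org2', 'org3']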
Also, we have no way of knowing what ConfigBuilder or builder.parse_config does, since we don't have access to that code, so sorry for not taking your example into account.
a = {
  "organisations": [
    { "displayName" : "org1", "bucketName" : "org1_bucket" },
    { "displayName" : "org2", "bucketName" : "org2_bucket" },
    { "displayName" : "org3", "bucketName" : "org3_bucket" }
  ]
}
print([i["displayName"] for i in a["organisations"]])
Output:
['org1', 'org2', 'org3']
Use a list comprehension; it's very easy. To read the JSON file:
import json
data = json.load(open("config.json"))
Use lambda with map to get an array of only the organisation names:
>>> list(map(lambda i: i['displayName'], data['organisations']))
['org1', 'org2', 'org3']
If you want to read JSON data from a file into a dictionary, you can achieve it as follows:

import json

with open('config.json') as json_file:
    data = json.load(json_file)
org_array = list(map(lambda i: i['displayName'], data['organisations']))

How to create a filename with the current date and time in python when query is ran

When I run my query below, it creates a file called 'mycsvfile'. Is there a way to add the current date and timestamp when the CSV file is created? For example, if I run this query now, the file should be named something like mycsvfile20171012-10:00:00.
Could someone edit my code and show me how to do this, please?
My code:
from elasticsearch import Elasticsearch
import csv

es = Elasticsearch(["9200"])

# Replace the following Query with your own Elastic Search Query
res = es.search(index="search", body={
    "_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
    "query": {
        "bool": {
            "should": [
                {"wildcard": {"CN": "TEST1"}}
            ]
        }
    }
}, size=10)

header_names = { 'DTDT': 'DATE', 'TRDT': 'TIME', ...}

with open('mycsvfile.csv', 'w') as f:  # Just use 'w' mode in 3.x
    header_present = False
    for doc in res['hits']['hits']:
        my_dict = doc['_source']
        if not header_present:
            w = csv.DictWriter(f, my_dict.keys())
            w.writerow(header_names)  # will write DATE, TIME, ... in correct place
            header_present = True
        w.writerow(my_dict)
Thank you in advance!
It is better to use underscores in a filename than other special characters, since they are widely accepted. Therefore, construct the file name as below:
csv_file = 'myfile_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
Use datetime as below:
from elasticsearch import Elasticsearch
from datetime import datetime
import csv
import os

es = Elasticsearch(["9200"])

# Replace the following Query with your own Elastic Search Query
res = es.search(index="search", body={
    "_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
    "query": {
        "bool": {
            "should": [
                {"wildcard": {"CN": "TEST1"}}
            ]
        }
    }
}, size=10)

file_path = <PASS YOUR FILE HERE>
csv_file = 'myfile_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
# Join the directory and file name directly; inserting os.sep would discard file_path
csv_file_full = os.path.join(file_path, csv_file)

header_names = { 'DTDT': 'DATE', 'TRDT': 'TIME', ...}

with open(csv_file_full, 'w') as f:  # Just use 'w' mode in 3.x
    header_present = False
    for doc in res['hits']['hits']:
        my_dict = doc['_source']
        if not header_present:
            w = csv.DictWriter(f, my_dict.keys())
            w.writerow(header_names)  # will write DATE, TIME, ... in correct place
            header_present = True
        w.writerow(my_dict)
Yes, you can do it like this. Note that ":" is not supported in filenames, so use something like 20171010-10.00.00:

>>> import time
>>> fname = lambda: "mycsvfile{}.csv".format(time.strftime("%Y%m%d-%H.%M.%S"))
>>>
>>> fname()
'mycsvfile20171012-17.24.59.csv'
>>> with open(fname(), "w") as f:
...     pass
Have a variable for the file name, file_name, and use datetime.now():

from datetime import datetime
file_name = 'mycsvfile' + str(datetime.now()) + '.csv'
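
Note that str(datetime.now()) contains colons and spaces, which Windows filenames don't allow (as the previous answer points out). A safe variant using strftime (this format string is one reasonable choice, not the only one):

from datetime import datetime

# Colon-free timestamp, e.g. mycsvfile20171012-10.00.00.csv
file_name = 'mycsvfile' + datetime.now().strftime('%Y%m%d-%H.%M.%S') + '.csv'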

Parse JSON array in Python

I have a JSON file:
{
  "cis" : [ {
    "ucmdbId" : "835cfedfaabc32a1358b322ff3bae056",
    "type" : "running_software",
    "properties" : {
      "display_label" : "jboss (site1.ru)"
    }
  }, {
    "ucmdbId" : "7ef9f21c132c12b3d8d2af0964cc5970",
    "type" : "node",
    "properties" : {
      "display_label" : "site2.ru"
    }
  } ],
  "relations" : [ {
    "ucmdbId" : "80c42edbe32fbb4c25621756ec9e09d2",
    "type" : "compound_f",
    "properties" : null,
    "end1Id" : "23e30baf2320a3274d0aa1e7f56cdaef",
    "end2Id" : "15af0ba134327d32a0c5c72450e63fcd"
  }, {
    "ucmdbId" : "7fe9fb15d4462d1212aeee4aef2f32b4",
    "type" : "compound_f",
    "properties" : null,
    "end1Id" : "23e30baf2320a3274d0aa327f56cdaef",
    "end2Id" : "9232dd2621b814da632932e8cd33ffc8"
  } ]
}
I only need the cis array. So this is what I need to parse:
[{
  "ucmdbId" : "835cfedfaabc32a1358b322ff3bae056",
  "type" : "running_software",
  "display_label" : "jboss (site1.ru)"
}, {
  "ucmdbId" : "7ef9f21c132c12b3d8d2af0964cc5970",
  "type" : "node",
  "display_label" : "site2.ru"
}]
Python script:
#!/usr/bin/python
import sys
import os
import tablib
import pandas as pd
import json
from pandas.io.json import json_normalize
f = open('/home/nik/test.json', 'r')
jsonArray = f.read()
f.close()
data = json.dumps(json.loads(jsonArray)['cis'])
jsonResult = pd.read_json(data)
array = json.loads(jsonArray)
print jsonArray
jsonResult.to_excel('/home/nik/output.xlsx', sheet_name='Sheet1')
But how can I get the key parameters? I tried to use:
print data['type'].keys()
print data['type']
But it gives me an error:
AttributeError: 'str' object has no attribute 'keys'
How can I get the proper JSON format?
Update. Solution:
Thanks, it works. My complete code to export the JSON into an xlsx file:
#!/usr/bin/python
import subprocess
import sys
import os
import tablib
import pandas as pd
import json
import glob
import string

path = '/home/nik/json'
for jsonfile in glob.glob(os.path.join(path, '*.json')):
    #jsonfile = '/home/nik/test.json'
    with open(jsonfile) as data_file:
        data = json.load(data_file)
    JSON = '[{ \n'
    for index, item in enumerate(data['cis']):
        ucmdbId = (item['ucmdbId'])
        type = (item['type'])
        display_label = (item['properties']['display_label'])
        Text1 = ' \"ucmdbId\" : \"%s\",' % (ucmdbId)
        Text2 = ' \"type\" : \"%s\",' % (type)
        Text3 = ' \"display_label\" : \"%s\",' % (display_label)
        if index == (len(data['cis']) - 1):
            End = '}]'
        else:
            End = '}, {'
        JSON += Text3 + '\n' + Text2 + '\n' + Text1 + '\n' + End + '\n'
    JSON = JSON.translate({ord(c): None for c in '\/'})
    jsonResult = pd.read_json(JSON)
    jsonResult = jsonResult.sort_values(by='type')
    jsonResult.to_excel(jsonfile + '.xlsx', sheet_name='Object monitoring', index=False)
import json
from pprint import pprint

jsonfile = 'C:\\temp\\temp.json'  # path to your json file
with open(jsonfile) as data_file:
    data = json.load(data_file)
pprint(data['cis'])
The above will give you just the cis array.
Below is a more granular output
for item in data['cis']:
    ucmdbId = (item['ucmdbId'])
    type = (item['type'])
    display_label = (item['properties']['display_label'])
    print(ucmdbId)
    print(type)
    print(display_label)
If you want it with key labels, then use:
for item in data['cis']:
    ucmdbId = (item['ucmdbId'])
    type = (item['type'])
    display_label = (item['properties']['display_label'])
    print('ucmdbId:{}'.format(ucmdbId))
    print('type:{}'.format(type))
    print('display_label:{}'.format(display_label))
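
As an aside, a sketch using json_normalize (already imported in the question's script) could replace the hand-built JSON string in the update above; the properties.display_label column name follows from the nesting:

import json
import pandas as pd
from pandas.io.json import json_normalize

with open('/home/nik/test.json') as data_file:
    data = json.load(data_file)

# Flattens nested dicts; 'properties.display_label' becomes its own column
df = json_normalize(data['cis'])
df = df.rename(columns={'properties.display_label': 'display_label'})
df = df.sort_values(by='type')
df.to_excel('/home/nik/test.json.xlsx', sheet_name='Object monitoring', index=False)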
