I am trying to parse multiple JSON files from a folder in Python and save them to a single CSV.
This is my JSON file format:
{
    "width": 4032,
    "height": 3024,
    "ispano": false,
    "objects": [
        {
            "key": "vERA48mAToOV36JrGge-8w",
            "label": "regulatory--no-heavy-goods-vehicles--g2",
            "bbox": {
                "xmin": 1702.96875,
                "ymin": 812.84765625,
                "xmax": 2181.375,
                "ymax": 1304.54296875
            },
            "properties": {
                "barrier": false,
                "occluded": false,
                "out-of-frame": false,
                "exterior": false,
                "ambiguous": false,
                "included": false,
                "direction-or-information": false,
                "highway": false,
                "dummy": false
            }
        },
        {
            "key": "MXdgK-YrQrSrATvLYkJ7kQ",
            "label": "information--dead-end--g1",
            "bbox": {
                "xmin": 1283.625,
                "ymin": 488.7421875,
                "xmax": 1739.390625,
                "ymax": 1050.57421875
            },
            "properties": {
                "barrier": false,
                "occluded": false,
                "out-of-frame": false,
                "exterior": false,
                "ambiguous": false,
                "included": false,
                "direction-or-information": false,
                "highway": false,
                "dummy": false
            }
        }
    ]
}
I don't need all of the information, so I go through each sub-dictionary. This is how I extract the data in Python:
import glob
import json
import csv
import os

data = []
root = glob.glob("./labels/*.json")
for single_file in root:
    with open(single_file, "r") as f:
        json_file = json.load(f)
I iterate over each sub-dictionary like this and append it to a list:
    for sub_dict in json_file["objects"]:
        print(sub_dict)
        lst = []
        count = 0
        for key, val in sub_dict.items():
            #print(val)
            lst.append([
                sub_dict["key"],
                sub_dict["label"],
                sub_dict["bbox"]["xmin"],
                sub_dict["bbox"]["ymin"],
                sub_dict["bbox"]["xmax"],
                sub_dict["bbox"]["ymax"]
            ])
        #print(lst)
        # Add headers
        lst.insert(0, ["key", "label", "xmin", "ymin", "xmax", "ymax"])
        dir = "./"
        with open(os.path.join(dir, "test.csv"), "w", newline="") as d:
            writer = csv.writer(d)
            #writer.writerow(lst)
            writer.writerows(lst)
        count += 1
        print('updated csv')
It saves a CSV file named 'test.csv', but it only contains the information from the last row, not from all of the JSON files.
I want to save a CSV that includes the mentioned information from all of the JSON files.
I want the CSV to look like this:
| file_name | key | label | xmin | ymin | xmax | ymax |
It includes the corresponding file_name, key, label, xmin, ymin, xmax, and ymax.
Could you please help me to solve my problem?
You can just write each row to the file as you iterate over the objects. (Your version reopens test.csv in write mode for every object, so each pass overwrites the previous contents; open the output file once instead.)
import glob
import json
import csv

with open('test.csv', 'w', newline='') as f_csv:
    csv_output = csv.writer(f_csv)
    csv_output.writerow(["file_name", "key", "label", "xmin", "ymin", "xmax", "ymax"])
    for single_file in glob.glob("*.json"):
        print(single_file)
        with open(single_file) as f_json:
            json_data = json.load(f_json)
        for obj in json_data["objects"]:
            csv_output.writerow([
                single_file,
                obj["key"],
                obj["label"],
                obj["bbox"]["xmin"],
                obj["bbox"]["ymin"],
                obj["bbox"]["xmax"],
                obj["bbox"]["ymax"]
            ])
Giving you test.csv as follows:
file_name,key,label,xmin,ymin,xmax,ymax
test1.json,vERA48mAToOV36JrGge-8w,regulatory--no-heavy-goods-vehicles--g2,1702.96875,812.84765625,2181.375,1304.54296875
test1.json,MXdgK-YrQrSrATvLYkJ7kQ,information--dead-end--g1,1283.625,488.7421875,1739.390625,1050.57421875
test2.json,vERA48mAToOV36JrGge-8w,regulatory--no-heavy-goods-vehicles--g3,1702.96875,812.84765625,2181.375,1304.54296875
test2.json,MXdgK-YrQrSrATvLYkJ7kQ,information--dead-end--g1,1283.625,488.7421875,1739.390625,1050.57421875
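If you'd prefer the file_name column to hold just the file's name rather than whatever path the glob returns (an optional tweak, not something the question requires), os.path.basename strips the directory part; pass it in place of single_file in the writerow call:

import os

file_name = os.path.basename(single_file)  # "./labels/test1.json" -> "test1.json"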
I have many JSON files under the /json/reports/ location, and each JSON file needs to be converted into its own CSV file.
I have the following Python code for the conversion:
import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    f = open(file, 'r')
    jsonData = pd.read_json(f.read())
    jsonData.to_csv(f.name + ".csv")
    f.close()
The first few lines of one of my JSON files look as follows:
[
    {
        "api_result": "KEY_NAME",
        "ml_result": "VALUE",
        "line_data_list": [
            {
                "line": "54A2FF607A6dsewroadeEOERD> |-",
                "line_num": 9053,
                "path": "/home/user/src/common/race/flow/prog_flow.mk",
                "value": "WOERJFOQDKSDFKKASDF0",
                "variable": null,
                "entropy_validation": true
            }
        ],
        "ml_part": 0.994396984577179,
        "rule": "GCP Client ID",
        "severity": "high"
    },
    {
        "api_result": "NOT_AVAILABLE",
        "ml_result": "NOT_AVAILABLE",
        "line_data_list": [
            {
                "line": "-----BEGIN Result-----",
                "line_num": 19873,
                "path": "/home/user/test/linux/ops/format.key",
                "value": "-----BEGIN RSA PRIVATE",
                "variable": null,
                "entropy_validation": false
            }
        ],
        "ml_part": null,
        "rule": "Certificate",
        "severity": "low"
    },
    ...
Problem:
The above Python code writes the line_data_list values (line, line_num, path, value, variable, and entropy_validation) into a single column, but I need each value in a separate column (i.e. in the format specified below).
Expected output CSV per JSON file, with the line_data_list fields (line, line_num, path, value, variable, entropy_validation) expanded into separate columns:
| Sl.no | api_result | ml_result | line | line_num | path | value | variable | entropy_validation | ml_part | rule | severity |
| 1 | KEY_NAME | VALUE | 54A2FF607A6dsewroadeEOERD | 9053 | /home/user98/src/common/race/flow/prog_flow.mk | WOERJFOQDKSDFKKASDFO | null | TRUE | 0.994396985 | GCP Client ID | high |
| 2 | NOT_AVAILABLE | NOT_AVAILABLE | -----BEGIN Result----- | 19873 | /home/user/test/linux/ops/format.key | -----BEGIN RSA PRIVATE | null | false | null | Certificate | low |
| 3 | ... |
I need help getting each value into a separate column.
You can expand the first line_data_list entry into separate columns with pandas. For a single JSON file:
df = pd.read_json("mydata.json")
t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
pd.concat([df, t], axis=1)
In your case:
import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    df = pd.read_json(file)
    # expand the first line_data_list entry into its own columns
    t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
    df = pd.concat([df, t], axis=1)
    df.to_csv(f'{file[:-5]}.csv')
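If a line_data_list ever holds more than one entry, the x[0] approach above keeps only the first one. pd.json_normalize (pandas 1.0+) instead emits one output row per entry; a sketch under that assumption:

import glob
import json
import pandas as pd

for file in glob.glob('/json/reports/*.json'):
    with open(file) as f:
        records = json.load(f)
    # one row per line_data_list entry; the listed top-level fields
    # are repeated onto each of those rows
    df = pd.json_normalize(
        records,
        record_path='line_data_list',
        meta=['api_result', 'ml_result', 'ml_part', 'rule', 'severity'],
    )
    df.to_csv(f'{file[:-5]}.csv', index=False)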
You need to unpack your line_data_list key-value pairs so they occur on the same level as your other columns. Something like what I've written below would work.
import pandas as pd
import glob
import json

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    with open(file, 'r') as f:
        records = json.load(f)
    for record in records:
        # line_data_list is a list of dicts; lift the first entry's
        # key-value pairs up to the record's top level
        line_data = record.pop('line_data_list')[0]
        record.update(line_data)
    jsonData = pd.DataFrame(records)
    jsonData.to_csv(file + '.csv')
I am reading data from a JSON file to check the existence of some values.
In the JSON structure below, I try to find adomain in the bid data and check whether there is a cat value, which is not always present.
How do I fix the code below?
import pandas as pd
import json
path = 'C:/MyWorks/Python/Anal/data_sample.json'
records = [json.loads(line) for line in open(path, encoding='utf-8')]
adomain = [
    rec['win_res']['seatbid'][0]['bid'][0]['adomain']
    for rec in records
    if 'adomain' in rec
]
Here is a data sample:
[
    {
        "win_res": {
            "id": "12345",
            "seatbid": [
                {
                    "bid": [
                        {
                            "id": "12345",
                            "impid": "1",
                            "price": 0.1,
                            "adm": "",
                            "adomain": [
                                "adomain.com"
                            ],
                            "iurl": "url.com",
                            "cid": "11",
                            "crid": "11",
                            "cat": [
                                "IAB12345"
                            ],
                            "w": 1,
                            "h": 1
                        }
                    ],
                    "seat": "1"
                }
            ]
        }
    }
]
The adomain value is always present, but the cat value is sometimes missing.
I want to collect adomain only for bids that also have a cat value; how can I express that?
Your question is not clear but I think this is what you are looking for:
import json
path = 'C:/MyWorks/Python/Anal/data_sample.json'
with open(path, encoding='utf-8') as f:
    records = json.load(f)

adomain = [
    _['win_res']['seatbid'][0]['bid'][0]['adomain']
    for _ in records
    if _['win_res']['seatbid'][0]['bid'][0].get('adomain') and
       _['win_res']['seatbid'][0]['bid'][0].get('cat')
]
The code above will add the value of ['win_res']['seatbid'][0]['bid'][0]['adomain'] to the list adomain only if there is a ['win_res']['seatbid'][0]['bid'][0]['cat'] corresponding value.
The code will be a lot clearer if we just walk through a bids list. Something like this:
import json
path = 'C:/MyWorks/Python/Anal/data_sample.json'
with open(path, encoding='utf-8') as f:
    records = json.load(f)

bids = [_['win_res']['seatbid'][0]['bid'][0] for _ in records]
adomain = [
    _['adomain']
    for _ in bids
    if _.get('adomain') and _.get('cat')
]
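Both versions assume every record contains the full win_res -> seatbid -> bid nesting. If that structure can be missing or empty (an assumption beyond the sample shown), a plain loop lets you skip malformed records instead of raising:

adomain = []
for rec in records:
    try:
        bid = rec['win_res']['seatbid'][0]['bid'][0]
    except (KeyError, IndexError):
        continue  # record doesn't have the expected structure
    if bid.get('adomain') and bid.get('cat'):
        adomain.append(bid['adomain'])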
I'm converting several JSON files into CSVs using the code below. It works as intended, but it converts all of the data in each JSON file. Instead, I want it to do the following:
Load JSON file [done]
Extract certain nested data in the JSON file [wip]
Convert to CSV [done]
Current Code
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'
# Open and load the JSON file
dic = json.load(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))
# Flatten and convert to a data frame
dic_flattened = (flatten(d, '.') for d in dic)
df = pandas.DataFrame(dic_flattened)
# Export to CSV in the same directory with the original file name
df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
In the example at the bottom, I only want everything under the following keys: created, emails, and identities. The rest is useless information (such as statusCode) or it's duplicated under a different key name (such as profile and userInfo).
I know it requires a for loop and an if statement to specify the key names later on, but I'm not sure of the best way to implement it. This is what I have so far:
Attempted Code
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'
# Open and load the JSON file
json_file = open(file_path + '.json', 'r', encoding='utf-8', errors='ignore')
dic = json.load(json_file)
# List keys to extract
key_list = ['created', 'emails', 'identities']
for d in dic:
    #print(d['identities'])  # Print all 'identities'
    #if 'identities' in d:  # Check if 'identities' exists
    if key_list in d:
        # Flatten and convert to a data frame
        #dic_flattened = (flatten(d, '.') for d in dic)
        #df = pandas.DataFrame(dic_flattened)
        pass
    else:
        # Skip
        pass

# Export to CSV in the same directory with the original file name
#df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
Is this the right logic?
file_name.json Example
[
    {
        "callId": "abc123",
        "errorCode": 0,
        "apiVersion": 2,
        "statusCode": 200,
        "statusReason": "OK",
        "time": "2020-12-14T12:00:32.744Z",
        "registeredTimestamp": 1417731582000,
        "UID": "_guid_abc123==",
        "created": "2014-12-04T22:19:42.894Z",
        "createdTimestamp": 1417731582000,
        "data": {},
        "preferences": {},
        "emails": {
            "verified": [],
            "unverified": []
        },
        "identities": [
            {
                "provider": "facebook",
                "providerUID": "123",
                "allowsLogin": true,
                "isLoginIdentity": true,
                "isExpiredSession": true,
                "lastUpdated": "2014-12-04T22:26:37.002Z",
                "lastUpdatedTimestamp": 1417731997002,
                "oldestDataUpdated": "2014-12-04T22:26:37.002Z",
                "oldestDataUpdatedTimestamp": 1417731997002,
                "firstName": "John",
                "lastName": "Doe",
                "nickname": "John Doe",
                "profileURL": "https://www.facebook.com/John.Doe",
                "age": 30,
                "birthDay": 31,
                "birthMonth": 12,
                "birthYear": 1969,
                "city": "City, State",
                "education": [
                    {
                        "school": "High School Name",
                        "schoolType": "High School",
                        "degree": null,
                        "startYear": 0,
                        "fieldOfStudy": null,
                        "endYear": 0
                    }
                ],
                "educationLevel": "High School",
                "followersCount": 0,
                "gender": "m",
                "hometown": "City, State",
                "languages": "English",
                "locale": "en_US",
                "name": "John Doe",
                "photoURL": "https://graph.facebook.com/123/picture?type=large",
                "timezone": "-8",
                "thumbnailURL": "https://graph.facebook.com/123/picture?type=square",
                "username": "john.doe",
                "verified": "true",
                "work": [
                    {
                        "companyID": null,
                        "isCurrent": null,
                        "endDate": null,
                        "company": "Company Name",
                        "industry": null,
                        "title": "Company Title",
                        "companySize": null,
                        "startDate": "2010-12-31T00:00:00"
                    }
                ]
            }
        ],
        "isActive": true,
        "isLockedOut": false,
        "isRegistered": true,
        "isVerified": false,
        "lastLogin": "2014-12-04T22:26:33.002Z",
        "lastLoginTimestamp": 1417731993000,
        "lastUpdated": "2014-12-04T22:19:42.769Z",
        "lastUpdatedTimestamp": 1417731582769,
        "loginProvider": "facebook",
        "loginIDs": {
            "emails": [],
            "unverifiedEmails": []
        },
        "rbaPolicy": {
            "riskPolicyLocked": false
        },
        "oldestDataUpdated": "2014-12-04T22:19:42.894Z",
        "oldestDataUpdatedTimestamp": 1417731582894,
        "registered": "2014-12-04T22:19:42.956Z",
        "regSource": "",
        "socialProviders": "facebook"
    }
]
As mentioned by juanpa.arrivillaga, I simply needed to add the following line after key_list is defined:
json_list = [{k:d[k] for k in key_list} for d in json_list]
This is the full working code:
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'
# Open and load the JSON file
json_list = json.load(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))
# Extract data from the defined key names
key_list = ['created', 'emails', 'identities']
json_list = [{k:d[k] for k in key_list} for d in json_list]
# Flatten and convert to a data frame
json_list_flattened = (flatten(d, '.') for d in json_list)
df = pandas.DataFrame(json_list_flattened)
# Export to CSV in the same directory with the original file name
df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
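If some records might be missing one of the listed keys (an assumption; the sample record contains all three), dict.get leaves the value as None instead of raising a KeyError:

json_list = [{k: d.get(k) for k in key_list} for d in json_list]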
I have written code to convert a CSV file to nested JSON format. I have multiple columns to be nested, so I assign them separately for each column. The problem is that I'm getting two fields for the same column in the JSON output.
import csv
import json
from collections import OrderedDict

csv_file = 'data.csv'
json_file = csv_file + '.json'

def main(input_file):
    csv_rows = []
    with open(input_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='|')
        for row in reader:
            row['TYPE'] = 'REVIEW',  # adding new key, value
            row['RAWID'] = 1,
            row['CUSTOMER'] = {
                "ID": row['CUSTOMER_ID'],
                "NAME": row['CUSTOMER_NAME']
            }
            row['CATEGORY'] = {
                "ID": row['CATEGORY_ID'],
                "NAME": row['CATEGORY']
            }
            del (row["CUSTOMER_NAME"], row["CATEGORY_ID"],
                 row["CATEGORY"], row["CUSTOMER_ID"])  # deleting since fields occurring twice
            csv_rows.append(row)
    with open(json_file, 'w') as f:
        json.dump(csv_rows, f, sort_keys=True, indent=4, ensure_ascii=False)
        f.write('\n')
The output is as below:
[
    {
        "CATEGORY": {
            "ID": "1",
            "NAME": "Consumers"
        },
        "CATEGORY_ID": "1",
        "CUSTOMER_ID": "41",
        "CUSTOMER": {
            "ID": "41",
            "NAME": "SA Port"
        },
        "CUSTOMER_NAME": "SA Port",
        "RAWID": [
            1
        ]
    }
]
I'm getting two entries for each field I have assigned using row[...]. Is there a way to get rid of this? I want only one entry for a particular field in each record.
Also, how can I convert the keys to lower case after reading from csv.DictReader()? In my CSV file all the columns are in upper case, hence I'm using upper case to assign, but I want all of the output keys in lower case.
In order to convert the keys to lower case, it is simpler to generate a new dict per row; as a bonus, this also gets rid of the duplicate fields:
for row in reader:
    orow = OrderedDict()
    orow['type'] = 'REVIEW'  # adding new key, value
    orow['rawid'] = 1
    orow['customer'] = {
        "id": row['CUSTOMER_ID'],
        "name": row['CUSTOMER_NAME']
    }
    orow['category'] = {
        "id": row['CATEGORY_ID'],
        "name": row['CATEGORY']
    }
    csv_rows.append(orow)
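If you'd rather keep every original column and just lower-case all the keys wholesale (a general-purpose alternative, not tied to the fixed field list above), a dict comprehension per row does it:

for row in reader:
    # lower-case every key produced by csv.DictReader
    csv_rows.append({k.lower(): v for k, v in row.items()})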
The query below grabs data and creates a CSV file. The issue I am having is that the source field 'SPLE' stores its data in the database as the numbers 0, 1, and 50.
Those raw numbers are written to the CSV as-is, but I would like them to be represented as words when the CSV is created, such that:
0 = True
1 = False
50 = Pending
Could someone show me how this is done, please? I have been struggling with this.
My Code:
from elasticsearch import Elasticsearch
import csv

es = Elasticsearch(["9200"])
res = es.search(index="search", body={
    "_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
    "query": {
        "bool": {
            "should": [
                {"wildcard": {"CN": "TEST*"}}
            ]
        }
    }
}, size=10)
header_names = { 'DTDT': 'DATE', 'SPLE': 'TAG', ...}
with open('mycsvfile.csv', 'w') as f:
    header_present = False
    for doc in res['hits']['hits']:
        my_dict = doc['_source']
        if not header_present:
            w = csv.DictWriter(f, my_dict.keys())
            w.writerow(header_names)
            header_present = True
        w.writerow(my_dict)
The output in the CSV file is:
Date SPLE Venue
20171016 1 Central
20171016 0 Central
20171016 50 Central
I'm assuming your mycsvfile.csv file has an SPLE column. After your search runs and the CSV is written, you can post-process it with pandas:
from elasticsearch import Elasticsearch

es = Elasticsearch(["9200"])
res = es.search(index="search", body={
    "_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
    "query": {
        "bool": {
            "should": [
                {"wildcard": {"CN": "TEST*"}}
            ]
        }
    }
}, size=10)
import pandas as pd

SPLE = {0: 'true', 1: 'false', 50: 'pending'}
# the CSV written above is comma-separated (csv.DictWriter's default)
saved_csv = pd.read_csv('mycsvfile.csv')
saved_csv['SPLE'] = saved_csv['SPLE'].map(lambda x: SPLE[int(x)])
saved_csv.to_csv('edited_csv.csv', index=False)
Declare a dict somewhere for doing the translation:
SPLE_TRANSLATION = {0: 'True', 1: 'False', 50: 'Pending'}
Then, inside your loop:
my_dict['SPLE'] = SPLE_TRANSLATION[my_dict['SPLE']]
w.writerow(my_dict)
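If the index could ever return an SPLE code other than 0, 1, or 50 (an assumption; the data shown only uses those three), dict.get with a fallback passes the raw value through instead of raising a KeyError:

code = my_dict.get('SPLE')
my_dict['SPLE'] = SPLE_TRANSLATION.get(code, code)
w.writerow(my_dict)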