json data into individual csv file - python

I have many JSON files under the /json/reports/ location, and each JSON file needs to be converted into its own CSV file.
I have the following Python code to do the conversion.
import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    f = open(file, 'r')
    jsonData = pd.read_json(f.read())
    jsonData.to_csv(f.name + ".csv")
    f.close()
The first few lines of one of my JSON files look like this:
[
  {
    "api_result": "KEY_NAME",
    "ml_result": "VALUE",
    "line_data_list": [
      {
        "line": "54A2FF607A6dsewroadeEOERD> |-",
        "line_num": 9053,
        "path": "/home/user/src/common/race/flow/prog_flow.mk",
        "value": "WOERJFOQDKSDFKKASDF0",
        "variable": null,
        "entropy_validation": true
      }
    ],
    "ml_part": 0.994396984577179,
    "rule": "GCP Client ID",
    "severity": "high"
  },
  {
    "api_result": "NOT_AVAILABLE",
    "ml_result": "NOT_AVAILABLE",
    "line_data_list": [
      {
        "line": "-----BEGIN Result-----",
        "line_num": 19873,
        "path": "/home/user/test/linux/ops/format.key",
        "value": "-----BEGIN RSA PRIVATE",
        "variable": null,
        "entropy_validation": false
      }
    ],
    "ml_part": null,
    "rule": "Certificate",
    "severity": "low"
  },
  ...
Problem:
The above Python code writes the line_data_list values (line, line_num, path, value, variable, and entropy_validation) into a single column, but I need each value in a separate column, i.e. in the format specified below.
Expected output CSV per JSON file (the line, line_num, path, value, variable, and entropy_validation columns are expanded out of line_data_list):
Sl.no,api_result,ml_result,line,line_num,path,value,variable,entropy_validation,ml_part,rule,severity
1,KEY_NAME,VALUE,54A2FF607A6dsewroadeEOERD,9053,/home/user98/src/common/race/flow/prog_flow.mk,WOERJFOQDKSDFKKASDFO,null,TRUE,0.994396985,GCP Client ID,high
2,NOT_AVAILABLE,NOT_AVAILABLE,-----BEGIN Result-----,19873,/home/user/test/linux/ops/format.key,-----BEGIN RSA PRIVATE,null,false,null,Certificate,low
3,...
I need help printing each value in a separate column.

For a single JSON file:
df = pd.read_json("mydata.json")
t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
pd.concat([df, t], axis=1)
In your case:
import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    df = pd.read_json(file)
    t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
    df = pd.concat([df, t], axis=1)
    df.to_csv(f'{file[:-5]}.csv')
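If a report can contain more than one entry in line_data_list, pandas.json_normalize is another possible way to give every entry its own row while keeping the top-level fields alongside it (a sketch, assuming the structure shown in the question):
import pandas as pd
import glob
import json

path = '/json/reports/*.json'
for file in glob.glob(path):
    with open(file, 'r') as f:
        records = json.load(f)
    # record_path expands every line_data_list entry into its own row,
    # meta keeps the top-level fields next to it
    df = pd.json_normalize(
        records,
        record_path='line_data_list',
        meta=['api_result', 'ml_result', 'ml_part', 'rule', 'severity'],
    )
    df.to_csv(f'{file[:-5]}.csv', index=False)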

You need to unpack your line_data_list key-value pairs so they occur on the same level as your other columns. Something like what I've written below would work.
import pandas as pd
import glob
import json

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    with open(file, 'r') as f:
        records = json.load(f)
    for record in records:
        # move the nested line_data_list keys up to the top level of each record
        line_data = record.pop("line_data_list")
        record.update(line_data[0])
    jsonData = pd.DataFrame.from_records(records)
    jsonData.to_csv(file + ".csv")

Related

convert a CSV file to JSON file

I am trying to convert a CSV file to a JSON file based on a column value. The CSV file looks something like this.
ID Name Age
CSE001 John 18
CSE002 Marie 20
ECE001 Josh 22
ECE002 Peter 23
Currently I am using the following code to obtain the JSON file.
import csv
import json

def csv_to_json(csv_file_path, json_file_path):
    data_dict = {}
    with open(csv_file_path, encoding='utf-8') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        for rows in csv_reader:
            key = rows['ID']
            data_dict[key] = rows
    with open(json_file_path, 'w', encoding='utf-8') as json_file_handler:
        json_file_handler.write(json.dumps(data_dict, indent=4))
OUTPUT:
{
    "CSE001": {
        "ID": "CSE001",
        "Name": "John",
        "Age": 18
    },
    "CSE002": {
        "ID": "CSE002",
        "Name": "Marie",
        "Age": 20
    },
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": 22
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": 23
    }
}
I want my output to generate two separate JSON files for CSE and ECE based on the ID value. Is there a way to achieve this output?
Required Output:
CSE.json:
{
    "CSE001": {
        "ID": "CSE001",
        "Name": "John",
        "Age": 18
    },
    "CSE002": {
        "ID": "CSE002",
        "Name": "Marie",
        "Age": 20
    }
}
ECE.json:
{
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": 22
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": 23
    }
}
I would suggest using pandas; that way it will be easier.
The code may look like:
import pandas as pd

def csv_to_json(csv_file_path):
    df = pd.read_csv(csv_file_path)
    df_CSE = df[df['ID'].str.contains('CSE')]
    df_ECE = df[df['ID'].str.contains('ECE')]
    df_CSE.to_json('CSE.json')
    df_ECE.to_json('ECE.json')
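If the goal is to match the requested output exactly (the ID as the top-level key in each file), a small variation of the same idea might look like the sketch below; the column names are taken from the question and 'input.csv' is a placeholder file name:
import pandas as pd

def csv_to_json(csv_file_path):
    df = pd.read_csv(csv_file_path)
    for prefix in ('CSE', 'ECE'):
        part = df[df['ID'].str.startswith(prefix)]
        # orient='index' keys the JSON by the index, here the ID column
        part.set_index('ID', drop=False).to_json(f'{prefix}.json', orient='index', indent=4)

csv_to_json('input.csv')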
You can create a DataFrame and then do the following:
import pandas as pd

df = pd.DataFrame.from_dict({
    "CSE001": {
        "ID": "CSE001",
        "Name": "John",
        "Age": 18
    },
    "CSE002": {
        "ID": "CSE002",
        "Name": "Marie",
        "Age": 20
    },
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": 22
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": 23
    }
}, orient='index')

df["id_"] = df["ID"].str[0:2]  # temp column for storing first two chars
grps = df.groupby("id_")[["ID", "Name", "Age"]]
for k, v in grps:
    print(v.to_json(orient="index"))  # you can create a json file as well
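For instance, each group could be written to its own file instead of being printed; here the file name is simply the two-character group key from above (CS.json, EC.json):
for k, v in grps:
    with open(f'{k}.json', 'w', encoding='utf-8') as f:
        f.write(v.to_json(orient='index', indent=4))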
You could store each row in a two-level dictionary, with the top level keyed by the first 3 characters of the ID.
These could then be written out into separate files with the key being part of the filename:
from collections import defaultdict
import csv
import json

def csv_to_json(csv_file_path, json_base_path):
    data_dict = defaultdict(dict)
    with open(csv_file_path, encoding='utf-8') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        for row in csv_reader:
            key = row['ID'][:3]
            data_dict[key][row['ID']] = row
    for key, values in data_dict.items():
        with open(f'{json_base_path}_{key}.json', 'w', encoding='utf-8') as json_file_handler:
            json_file_handler.write(json.dumps(values, indent=4))

csv_to_json('input.csv', 'output')
The defaultdict is used to avoid needing to first test if a key is already present before using it.
This would create output_CSE.json and output_ECE.json, e.g.
{
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": "22"
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": "23"
    }
}
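As a tiny illustration of why defaultdict(dict) helps here, compared with a plain dict (hypothetical keys):
from collections import defaultdict

# plain dict: the inner dict has to be created explicitly before use
plain = {}
if 'CSE' not in plain:
    plain['CSE'] = {}
plain['CSE']['CSE001'] = {'Name': 'John'}

# defaultdict(dict): the inner dict is created automatically on first access
auto = defaultdict(dict)
auto['CSE']['CSE001'] = {'Name': 'John'}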

CSV to JSON of Lists

Currently, I have a CSV file with the following example --
File  skill   experience  overall_experience
1     Java    1.5         3
1     Python  1.0         3
1     SQL     0.5         3
There are multiple entries across many such files, but I need to merge each file's skills and their respective experience values into lists under a single key, something like this -
{
    "1": {
        "file": "1",
        "skill": ["Java", "Python", "SQL"],
        "experience": [1.5, 1.0, 0.5],
        "Overall_exp": 3.0
    },
}
I tried some Python code for this, but it gives me only the value of the last skill and last experience (and not the whole thing as a list).
Here is the code I was using --
import csv
import json

# Function to convert a CSV to JSON
# Takes the file paths as arguments
def make_json(csvFilePath, jsonFilePath):
    # create a dictionary
    data = {}
    # Open a csv reader called DictReader
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        # Convert each row into a dictionary
        # and add it to data
        for rows in csvReader:
            # Assuming a column named 'file' to
            # be the primary key
            key = rows['file']
            data[key] = rows
    # Open a json writer, and use the json.dumps()
    # function to dump data
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(data, indent=4))

# Decide the two file paths according to your
# computer system
csvFilePath = 'skill_matrix.csv'
jsonFilePath = 'skill_matrix.json'

# Call the make_json function
make_json(csvFilePath, jsonFilePath)
The output that I get here is this --
{
    "1": {
        "file": "1",
        "skill": "SQL",
        "experience": "0.5",
        "Overall_exp": "3.0"
    },
}
How can I convert it to the former json format and not the latter?
You can use pandas to read your csv, group by File and export to json:
import numpy as np
import pandas as pd

df = pd.read_csv(your_csv)
df = df.groupby('File', as_index=False).agg({'skill': list, 'experience': list, 'overall_experience': np.mean})
print(df.to_json(orient='index', indent=4))
Note: you can specify the aggregation functions for your columns in a dictionary
Output:
{
    "0": {
        "File": 1,
        "skill": [
            "Java",
            "Python",
            "SQL"
        ],
        "experience": [
            1.5,
            1.0,
            0.5
        ],
        "overall_experience": 3.0
    }
}
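If you want to persist that result instead of printing it, one possible variant (reusing the file names from the question's own snippet) is:
import numpy as np
import pandas as pd

df = pd.read_csv('skill_matrix.csv')
df = df.groupby('File', as_index=False).agg(
    {'skill': list, 'experience': list, 'overall_experience': np.mean})
# write the grouped result straight to a JSON file
df.to_json('skill_matrix.json', orient='index', indent=4)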
I think that loading everything into pandas first and then narrowing down from all the data is cleaner and easier. You can use the following code to parse your data into JSON files;
import pandas as pd
import json

# Load the CSV into Pandas
df = pd.read_csv('1.csv', header=0)
data = df.to_dict(orient='list')

# Delete / change as you wish
data['File'] = str(data['File'][0])
data['overall_experience'] = data['overall_experience'][0]

# Save as json
with open('1.json', 'w', encoding='utf-8') as jsonf:
    jsonf.write(json.dumps(data, indent=4))
Result (1.json)
{
    "File": "1",
    "skill": [
        "Java",
        "Python",
        "SQL"
    ],
    "experience": [
        1.5,
        1.0,
        0.5
    ],
    "overall_experience": 3
}
I suppose that you have multiple file IDs in a CSV file; your given example is quite minimal. Anyhow, you can then create a master dictionary and add the smaller ones to it as follows;
import pandas as pd
import json

# Load the CSV into Pandas
df = pd.read_csv('1.csv', header=0)

# Master dictionary
master_dict = {}
for idx, file_id in enumerate(df["File"].unique()):
    data = df[df['File'] == file_id].to_dict(orient='list')
    # Delete / change as you wish
    data['File'] = str(data['File'][0])
    data['overall_experience'] = data['overall_experience'][0]
    master_dict[idx] = data

# Save as json
with open('1.json', 'w', encoding='utf-8') as jsonf:
    jsonf.write(json.dumps(master_dict, indent=4))
Result (1.json)
{
    "0": {
        "File": "1",
        "skill": [
            "Java",
            "Python",
            "SQL"
        ],
        "experience": [
            1.5,
            1.0,
            0.5
        ],
        "overall_experience": 3
    },
    "1": {
        "File": "2",
        "skill": [
            "Java",
            "Python"
        ],
        "experience": [
            2.0,
            2.5
        ],
        "overall_experience": 1
    }
}
If you don't want to use Pandas, you could try:
import csv
import json

def make_json(csvfile_path, jsonfile_path):
    data = {}
    with open(csvfile_path, "r") as csvfile:
        next(csvfile)  # Skip header line
        for row in csv.reader(csvfile):
            fdata = data.setdefault(row[0], {"file": row[0]})
            fdata.setdefault("skill", []).append(row[1])
            fdata.setdefault("experience", []).append(float(row[2]))
            fdata.setdefault("overall_experience", []).append(float(row[3]))
    with open(jsonfile_path, "w") as jsonfile:
        json.dump(data, jsonfile)
The main difference to your approach is the explicit structuring of the inner dicts: values are lists (except for the 'file' key). dict.setdefault() is great here: you can set a value for a key if it isn't in the dict, and get the value back (either the newly set one or the existing one). So you can put a list in the dict, get it back, and immediately .append() to it.
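A tiny illustration of that setdefault pattern on its own (hypothetical values):
data = {}
# "1" is missing, so the default {"file": "1"} is inserted and returned
fdata = data.setdefault("1", {"file": "1"})
fdata.setdefault("skill", []).append("Java")    # creates the list, then appends
fdata.setdefault("skill", []).append("Python")  # list already exists, just appends
print(data)  # {'1': {'file': '1', 'skill': ['Java', 'Python']}}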
If you want to use a csv.DictReader:
def make_json(csvfile_path, jsonfile_path):
    data = {}
    with open(csvfile_path, "r") as csvfile:
        for row in csv.DictReader(csvfile):
            fdata = data.setdefault(row["file"], {"file": row["file"]})
            for key, value in list(row.items())[1:]:
                fdata.setdefault(key, []).append(
                    value if key == "skill" else float(value)
                )
    with open(jsonfile_path, "w") as jsonfile:
        json.dump(data, jsonfile)
(I didn't use DictReader in the snippet above, since I wasn't sure about the actual column names.)

why is the python program reading data from just one sub-directory?

I have a program that does the below:
There are multiple folders, each of which contains a JSON file called "installed-files.json".
The program is supposed to read the JSON file from each of the sub-folders.
If the JSON file is there, it converts it into xlsx format.
The xlsx file should have worksheets named after the sub-folder names,
e.g. if the name of the sub-folder is CNA, the sheet name should be CNA, etc.
Below is the code snippet
import pandas as pd
import json
import os

def traverse_dir(rootDir, file_name):
    dir_names = []
    for names in os.listdir(rootDir):
        entry_path = os.path.join(rootDir, names)
        if os.path.isdir(entry_path):
            dir_names.append(entry_path)
    for fil_name in dir_names:
        file_path = os.path.join(fil_name, file_name)
        print(file_path)
        if os.path.isfile(file_path):
            with open(file_path) as jf:
                data = json.load(jf)
            df = pd.DataFrame(data)
            df1 = pd.DataFrame(data)
            new_df = df[df.columns.difference(['SHA256'])]
            new_df1 = df1[df.columns.difference(['SHA256'])]
            with pd.ExcelWriter('abc.xlsx') as writer:
                new_df.to_excel(writer, sheet_name='BRA', index=False)
                new_df1.to_excel(writer, sheet_name='CNA', index=False)
        else:
            print("file not found")

rootDir = <Full_Path_To_Sub-dirs>
file_name = 'installed-files.json'
traverse_dir(rootDir, file_name)
Below is the sample JSON file content
[
    {
        "SHA256": "123456",
        "Name": "/system/Home.apk",
        "Size": 99250072
    },
    {
        "SHA256": "987654",
        "Name": "/system/Setup.apk",
        "Size": 86578788
    },
    {
        "SHA256": "457457",
        "Name": "/system/SApp.apk",
        "Size": 72207922
    },
    {
        "SHA256": "747645",
        "Name": "/system/Lib.apk",
        "Size": 57960376
    },
    {
        "SHA256": "368764",
        "Name": "/system/mium.so",
        "Size": 51161376
    },
    {
        "SHA256": "34455",
        "Name": "/system/Smart.apk",
        "Size": 50944780
    },
    {
        "SHA256": "66777",
        "Name": "/system/framework/work.jar",
        "Size": 24772514
    },
]
Problem Statement:
The Excel sheets are getting created as per the sub-folder names (BRA and CNA), but the data is only coming from CNA. I can confirm this because the JSON files present in both sub-directories initially had the same data. To test my use case I modified the content of BRA first, but after executing the code those changes were not present in the new Excel file in either of the two tabs that got created. I then modified the JSON file in the CNA sub-folder, and when I execute the program I can see that modified data in both tabs of the Excel file.
Any ideas why that could be happening?
I have also attached a screenshot of the project directory structure.
Your problem is that you are writing an Excel file every time you find a file, and the data you are reading into both data frames is the same because you are getting it from the same JSON file. Also, you should check new_df1 = df1[df.columns.difference(['SHA256'])], because you are using both df and df1; I'm not sure if this is what you wanted.
Either way, here is a working code snippet:
import pandas as pd
import json
import os

def traverse_dir(root: str, file_name: str):
    data_cna = None
    data_bra = None
    for dir in os.listdir(root):
        dir_path = os.path.join(root, dir)
        # Grabs only the directories
        if not os.path.isdir(dir_path):
            continue
        for file in os.listdir(dir_path):
            file_path = os.path.join(dir_path, file)
            # Grabs only the files within the directories and with the name passed
            if not os.path.isfile(file_path):
                continue
            if file != file_name:
                continue
            if dir == "CNA":
                with open(file_path) as freader:
                    data_cna = json.load(freader)
            elif dir == "BRA":
                with open(file_path) as freader:
                    data_bra = json.load(freader)
            else:
                # Other directory names are ignored
                continue
    if data_cna is None:
        raise ValueError(f"{file_name} not found in {os.path.join(root, 'CNA')}")
    if data_bra is None:
        raise ValueError(f"{file_name} not found in {os.path.join(root, 'BRA')}")
    df_cna = pd.DataFrame(data_cna)[pd.DataFrame(data_cna).columns.difference(['SHA256'])]
    # Shouldn't this be: df_bra = pd.DataFrame(data_bra)[pd.DataFrame(data_bra).columns.difference(['SHA256'])]?
    # I mean, replace the data_cna difference by data_bra. Check your code.
    df_bra = pd.DataFrame(data_bra)[pd.DataFrame(data_cna).columns.difference(['SHA256'])]
    with pd.ExcelWriter('abc.xlsx') as writer:
        df_cna.to_excel(writer, sheet_name='CNA', index=False)
        df_bra.to_excel(writer, sheet_name='BRA', index=False)

rootDir = "."
file_name = 'installed-files.json'
traverse_dir(rootDir, file_name)
CNA JSON:
[
    {
        "SHA256": "123456",
        "Name": "/system/Home.apk",
        "Size": 99250072
    },
    {
        "SHA256": "987654",
        "Name": "/system/Setup.apk",
        "Size": 86578788
    }
]
BRA JSON:
[
    {
        "SHA256": "66777",
        "Name": "/system/framework/work.jar",
        "Size": 24772514
    }
]
xls output CNA page:
xls output BRA page:
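Since the original goal was one worksheet per sub-folder, whatever its name, a more general sketch (not tied to BRA/CNA, and assuming every sub-folder may contain an installed-files.json) could collect one DataFrame per directory and write the workbook in a single pass:
import pandas as pd
import json
import os

def traverse_dir(root_dir, file_name):
    frames = {}
    for name in os.listdir(root_dir):
        file_path = os.path.join(root_dir, name, file_name)
        # skip entries that are not sub-folders containing the JSON file
        if not os.path.isfile(file_path):
            continue
        with open(file_path) as jf:
            df = pd.DataFrame(json.load(jf))
        frames[name] = df[df.columns.difference(['SHA256'])]
    # one sheet per sub-folder, named after the sub-folder
    with pd.ExcelWriter('abc.xlsx') as writer:
        for sheet_name, df in frames.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

traverse_dir('.', 'installed-files.json')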

Scrape specific Json data to a csv

I am trying to scrape some JSON data. The first few rows are as follows, and all the rest is in the same format.
Json data:
{
    "data": [
        {
            "date": "2011-10-07",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-08",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-12",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-13",
            "f(avg(output_total)/number(100000000))": 54.0515120216902
        },
        .......
    ]
}
I want to scrape each date with its relevant value (for the above, 2011-10-07 and 50, 2011-10-08 and 50, etc.) into a CSV file which contains two columns (date and value).
How can I proceed with this? Is it possible with Python?
This is how I grabbed the json data:
import os
import requests
url='https://api.blockchair.com/litecoin/transactions?a=date,f(avg(output_total)/number(100000000))'
proxies = {}
response = requests.get(url=url, proxies=proxies)
print(response.content)
pandas allows you to solve this one in a few lines:
import pandas as pd

# json_data is the parsed JSON, e.g. response.json() from the request above
df = pd.DataFrame(json_data['data'])
df.columns = ["date", "value"]
df.to_csv("data.csv", index=False)
import pandas as pd

json = {
    "data": [
        {
            "date": "2011-10-07",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-08",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-12",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-13",
            "f(avg(output_total)/number(100000000))": 54.0515120216902
        }
    ]
}
Step 1: Convert json into a Pandas Dataframe
df = pd.DataFrame(json['data'])
Step 2: Filter the df based on conditions (e.g. value == 50)
df_filtered = df[(df["f(avg(output_total)/number(100000000))"] == 50)]
Step 3: Save the df to a CSV file, choosing the location where you'd like to store it on your computer.
df_filtered.to_csv(r'C:\user\foo\output.csv', index = False)
If you wish to include the index, simply remove index = False.
You can do it like this.
Iterate over the JSON string, extract the data you need, and then write that data to a CSV file.
import json
import csv

fields = ['Date', 'Value']
filename = 'test.csv'

s = """
{
    "data": [
        {
            "date": "2011-10-07",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-08",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-12",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-13",
            "f(avg(output_total)/number(100000000))": 54.0515120216902
        }
    ]
}
"""

x = json.loads(s)
with open(filename, 'w', newline='') as f:
    cw = csv.writer(f)
    cw.writerow(fields)
    for i in x['data']:
        cw.writerow(i.values())
test.csv
Date Value
07-10-11 50
08-10-11 50
12-10-11 50
13-10-11 54.05151202
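One small caveat: cw.writerow(i.values()) relies on each JSON object keeping its keys in date-then-value order (which json.loads preserves on Python 3.7+). If you prefer to be explicit about the columns, the loop could instead be written as:
for i in x['data']:
    cw.writerow([i['date'], i['f(avg(output_total)/number(100000000))']])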
If you just want a CSV file without relying on any additional Python modules (such as pandas) then it's very simple:
import requests

CSV = 'blockchair.csv'
url = 'https://api.blockchair.com/litecoin/transactions?a=date,f(avg(output_total)/number(100000000))'

with requests.Session() as session:
    response = session.get(url)
    response.raise_for_status()
    with open(CSV, 'w') as csv:
        csv.write('Date,Value\n')
        for d in response.json()['data']:
            for i, v in enumerate(d.values()):
                if i > 0:
                    csv.write(',')
                csv.write(str(v))
            csv.write('\n')
You can try this:
import requests
import csv
import pandas as pd

url = 'https://api.blockchair.com/litecoin/transactions?a=date,f(avg(output_total)/number(100000000))'
csv_name = 'res_values_1.csv'
response = requests.get(url=url).json()
res_data = response.get('data', [])

# Solution using pandas
res_df = pd.DataFrame(res_data)
res_df.rename(columns={'f(avg(output_total)/number(100000000))': 'value'}, inplace=True)
# filter data whose value is >= 50
filtered_res_df = res_df[(res_df["value"] >= 50)]
filtered_res_df.to_csv(csv_name, sep=',', encoding='utf-8', index=False)

# Solution using csv
csv_name = 'res_values_2.csv'
headers = ['date', 'value']
with open(csv_name, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for data in res_data:
        values = list(data.values())
        if values[1] >= 50:
            writer.writerow(values)
CSV Output:
date,value
2011-10-07,50.0
2011-10-08,50.0
2011-10-12,50.0
2011-10-13,54.0515120216902
.
.
.
2021-10-05,346.12752821011594
2021-10-06,293.5061907016782
2021-10-07,333.17665010641673
2021-10-08,332.2437737707938

How to retrieve nested values in json array recursively?

I have about 5k JSON files structured similarly. I need to get all the values for the "loc" key from all the files and store them in a separate JSON file or two. The total number of values for the "loc" key across all files comes to 78 million. So how can I get this done, ideally in the most optimized and fastest way?
Structure of content in all files looks like:
{
    "urlset": {
        "#xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
        "#xmlns:xhtml": "http://www.w3.org/1999/xhtml",
        "url": [
            {
                "loc": "https://www.example.com/a",
                "xhtml:link": {
                    "#rel": "alternate",
                    "#href": "android-app://com.example/xyz"
                },
                "lastmod": "2020-12-25",
                "priority": "0.8"
            },
            {
                "loc": "https://www.exampe.com/b",
                "xhtml:link": {
                    "#rel": "alternate",
                    "#href": "android-app://com.example/xyz"
                },
                "lastmod": "2020-12-25",
                "priority": "0.8"
            }
        ]
    }
}
I am looking for output json file like:
["https://www.example.com/a","https://www.example.com/b"]
What I am currently doing is:
import glob
import json

path = r'/home/spark/'  # path to folder containing files
link_list = []  # list of required links
li = ""  # contains text of all files combined
all_files = glob.glob(path + "/*")

# Looping through each file
for i in range(0, len(all_files)):
    filename = all_files[i]
    with open(filename, "r") as f:
        li = li + f.read()

# Retrieving link from every "loc" key
for k in range(0, 7800000):
    lk = ((li.split('"loc"', 1)[1]).split('"', 1)[1]).split(" ", 1)[0]
    link = lk.replace('",', '')
    link_list.append(link)

with open("output.json", "w") as f:
    f.write(json.dumps(link_list))
I guess this is the worst solution anyone can get :D, so I need to optimize it to do the job fast and efficiently.
import json
import glob

dict_results = {}
dict_results['links'] = []
for filename in glob.glob("*json"):
    with open(filename, "r") as msg:
        data = json.load(msg)
        for url in data['urlset']['url']:
            dict_results['links'].append(url['loc'])
print(dict_results)
If you just want all the links, that should do it. Just write the result to a file afterwards, as text or binary, as you wish.
Output:
{'links': ['https://www.example.com/a', 'https://www.exampe.com/b']}
In case you just want a list (and so not a json):
import json
import glob

list_results = []
for filename in glob.glob("*json"):
    with open(filename, "r") as msg:
        data = json.load(msg)
        for url in data['urlset']['url']:
            list_results.append(url['loc'])
print(list_results)
Output:
['https://www.example.com/a', 'https://www.exampe.com/b']
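To get the single output file the question asks for, the list can then simply be dumped (using the question's output.json name):
import json

with open("output.json", "w") as f:
    json.dump(list_results, f)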
If you work with text JSON files, as it seems you do, and you know/trust those files, the fastest way would certainly be this one:
import glob

list_results = []
for filename in glob.glob("*json"):
    with open(filename, "r") as msg:
        for line in msg:
            if '"loc"' in line:
                list_results.append(line.split('"')[3])
print(list_results)
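With roughly 78 million links, holding them all in one Python list may use a lot of memory. One possible variant streams each link straight into the output file as a JSON array while the input files are being read (a sketch, using the question's /home/spark/ folder):
import glob
import json

first = True
with open("output.json", "w") as out:
    out.write("[")
    for filename in glob.glob("/home/spark/*"):
        with open(filename, "r") as msg:
            data = json.load(msg)
        for url in data["urlset"]["url"]:
            if not first:
                out.write(",")
            out.write(json.dumps(url["loc"]))  # writes the quoted URL
            first = False
    out.write("]")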
