I am trying to convert a CSV file to a JSON file based on a column value. The CSV file looks somewhat like this:
ID,Name,Age
CSE001,John,18
CSE002,Marie,20
ECE001,Josh,22
ECE002,Peter,23
Currently I am using the following code to obtain the JSON file:
import csv
import json

def csv_to_json(csv_file_path, json_file_path):
    data_dict = {}
    with open(csv_file_path, encoding='utf-8') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        for rows in csv_reader:
            key = rows['ID']
            data_dict[key] = rows
    with open(json_file_path, 'w', encoding='utf-8') as json_file_handler:
        json_file_handler.write(json.dumps(data_dict, indent=4))
OUTPUT:
{
    "CSE001": {
        "ID": "CSE001",
        "Name": "John",
        "Age": 18
    },
    "CSE002": {
        "ID": "CSE002",
        "Name": "Marie",
        "Age": 20
    },
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": 22
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": 23
    }
}
I want my output to be two separate JSON files, one for CSE and one for ECE, based on the ID value. Is there a way to achieve this output?
Required Output:
CSE.json:
{
    "CSE001": {
        "ID": "CSE001",
        "Name": "John",
        "Age": 18
    },
    "CSE002": {
        "ID": "CSE002",
        "Name": "Marie",
        "Age": 20
    }
}
ECE.json:
{
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": 22
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": 23
    }
}
I would suggest using pandas; that way it will be easier. The code may look like:
import pandas as pd

def csv_to_json(csv_file_path):
    df = pd.read_csv(csv_file_path)
    df_CSE = df[df['ID'].str.contains('CSE')]
    df_ECE = df[df['ID'].str.contains('ECE')]
    # key each record by its ID so the output matches the required format
    df_CSE.set_index('ID', drop=False).to_json('CSE.json', orient='index', indent=4)
    df_ECE.set_index('ID', drop=False).to_json('ECE.json', orient='index', indent=4)
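If the prefixes aren't fixed in advance, a variant of the same idea (a sketch, assuming every ID starts with a three-letter prefix as in your sample) is to group on the derived prefix instead of hardcoding CSE and ECE:

import pandas as pd

def csv_to_json_by_prefix(csv_file_path):
    df = pd.read_csv(csv_file_path)
    # group on the first three characters of the ID ("CSE", "ECE", ...)
    for prefix, group in df.groupby(df['ID'].str[:3]):
        group.set_index('ID', drop=False).to_json(f'{prefix}.json', orient='index', indent=4)

This writes one JSON file per prefix found in the data, so new prefixes don't require code changes.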
You can create a DataFrame and then do the following:
import pandas as pd

df = pd.DataFrame.from_dict({
    "CSE001": {
        "ID": "CSE001",
        "Name": "John",
        "Age": 18
    },
    "CSE002": {
        "ID": "CSE002",
        "Name": "Marie",
        "Age": 20
    },
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": 22
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": 23
    }
}, orient='index')

df["id_"] = df["ID"].str[0:3]  # temp column storing the ID prefix (first three chars)
grps = df.groupby("id_")[["ID", "Name", "Age"]]
for k, v in grps:
    print(v.to_json(orient="index"))  # you can create a json file as well
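Picking up on that last comment, a sketch of writing each group to its own file (assuming the prefix should become the filename):

for k, v in grps:
    with open(f'{k}.json', 'w', encoding='utf-8') as f:
        f.write(v.to_json(orient="index", indent=4))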
You could store each row in a two-level dictionary, with the top level keyed by the first 3 characters of the ID.
These can then be written out into separate files, with the key forming part of the filename:
from collections import defaultdict
import csv
import json

def csv_to_json(csv_file_path, json_base_path):
    data_dict = defaultdict(dict)
    with open(csv_file_path, encoding='utf-8') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        for row in csv_reader:
            key = row['ID'][:3]
            data_dict[key][row['ID']] = row
    for key, values in data_dict.items():
        with open(f'{json_base_path}_{key}.json', 'w', encoding='utf-8') as json_file_handler:
            json_file_handler.write(json.dumps(values, indent=4))

csv_to_json('input.csv', 'output')
The defaultdict is used to avoid needing to first test whether a key is already present before using it; there is a short illustration after the example output below.
This would create output_CSE.json and output_ECE.json, e.g.
{
    "ECE001": {
        "ID": "ECE001",
        "Name": "Josh",
        "Age": "22"
    },
    "ECE002": {
        "ID": "ECE002",
        "Name": "Peter",
        "Age": "23"
    }
}
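For what it's worth, a minimal illustration of what defaultdict(dict) saves you compared with a plain dict:

from collections import defaultdict

plain = {}
auto = defaultdict(dict)

# with a plain dict you must create the inner dict before using it
if 'CSE' not in plain:
    plain['CSE'] = {}
plain['CSE']['CSE001'] = {'Name': 'John'}

# defaultdict creates the missing inner dict on first access
auto['CSE']['CSE001'] = {'Name': 'John'}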
I have many JSON files under the /json/reports/ location, and each JSON file needs to be converted into a CSV file individually.
I have the following Python code for the conversion:
import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    f = open(file, 'r')
    jsonData = pd.read_json(f.read())
    jsonData.to_csv(f.name+".csv")
    f.close()
The first few lines of one of my JSON files look as follows:
[
    {
        "api_result": "KEY_NAME",
        "ml_result": "VALUE",
        "line_data_list": [
            {
                "line": "54A2FF607A6dsewroadeEOERD> |-",
                "line_num": 9053,
                "path": "/home/user/src/common/race/flow/prog_flow.mk",
                "value": "WOERJFOQDKSDFKKASDF0",
                "variable": null,
                "entropy_validation": true
            }
        ],
        "ml_part": 0.994396984577179,
        "rule": "GCP Client ID",
        "severity": "high"
    },
    {
        "api_result": "NOT_AVAILABLE",
        "ml_result": "NOT_AVAILABLE",
        "line_data_list": [
            {
                "line": "-----BEGIN Result-----",
                "line_num": 19873,
                "path": "/home/user/test/linux/ops/format.key",
                "value": "-----BEGIN RSA PRIVATE",
                "variable": null,
                "entropy_validation": false
            }
        ],
        "ml_part": null,
        "rule": "Certificate",
        "severity": "low"
    },
.....
.......
..........
Problem:
The above Python code writes the line_data_list values (line, line_num, path, value, variable, and entropy_validation) into a single column, but I need each value in a separate column (i.e., in the format specified below).
Expected output CSV per JSON file (line_data_list expanded into its sub-fields):
Sl.no,api_result,ml_result,line,line_num,path,value,variable,entropy_validation,ml_part,rule,severity
1,KEY_NAME,VALUE,54A2FF607A6dsewroadeEOERD,9053,/home/user98/src/common/race/flow/prog_flow.mk,WOERJFOQDKSDFKKASDFO,null,TRUE,0.994396985,GCP Client ID,high
2,NOT_AVAILABLE,NOT_AVAILABLE,-----BEGIN Result-----,19873,/home/user/test/linux/ops/format.key,-----BEGIN RSA PRIVATE,null,false,null,Certificate,low
3
I need help getting each value into a separate column.
For a single JSON file, you can expand the first element of line_data_list into its own columns:

import pandas as pd

df = pd.read_json("mydata.json")
t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
pd.concat([df, t], axis=1)
In your case:
import pandas as pd
import glob

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    df = pd.read_json(file)
    t = df['line_data_list'].apply(lambda x: pd.Series(x[0]))
    df = pd.concat([df, t], axis=1)
    df.to_csv(f'{file[:-5]}.csv')
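As a side note, pandas also has json_normalize, which flattens nested records directly. A sketch under the same assumptions (one entry per line_data_list, column names as in the sample above):

import json
import pandas as pd

with open('mydata.json') as f:
    records = json.load(f)

# record_path expands each line_data_list entry into its own columns;
# meta carries the top-level fields along into each expanded row
df = pd.json_normalize(records, record_path='line_data_list',
                       meta=['api_result', 'ml_result', 'ml_part', 'rule', 'severity'])
df.to_csv('mydata.csv', index=False)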
You need to unpack your line_data_list key-value pairs so they occur on the same level as your other columns. Something like what I've written below would work.
import pandas as pd
import glob
import json

path = '/json/reports/*.json'
files = glob.glob(path)
for file in files:
    with open(file, 'r') as f:
        records = json.load(f)
    for record in records:
        # move the key-value pairs of the (single) line_data_list entry
        # up to the top level of the record
        line_data = record.pop("line_data_list")[0]
        record.update(line_data)
    pd.DataFrame(records).to_csv(file + ".csv")
Currently, I have a CSV file with the following example data:
File,skill,experience,overall_experience
1,Java,1.5,3
1,Python,1.0,3
1,SQL,0.5,3
There are multiple entries for many such files, but I need to merge the skills and their respective experience values into lists belonging to a single key, something like this:
{
    "1": {
        "file": "1",
        "skill": ["Java", "Python", "SQL"],
        "experience": [1.5, 1.0, 0.5],
        "Overall_exp": 3.0
    }
}
I tried Python code for this, but it gives me only the last skill and last experience value (not the whole thing as a list).
Here is the code I was using:
import csv
import json

# Function to convert a CSV to JSON
# Takes the file paths as arguments
def make_json(csvFilePath, jsonFilePath):
    # create a dictionary
    data = {}
    # Open a csv reader called DictReader
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        # Convert each row into a dictionary
        # and add it to data
        for rows in csvReader:
            # Assuming a column named 'file' to
            # be the primary key
            key = rows['file']
            data[key] = rows
    # Open a json writer, and use the json.dumps()
    # function to dump data
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(data, indent=4))

# Decide the two file paths according to your
# computer system
csvFilePath = 'skill_matrix.csv'
jsonFilePath = 'skill_matrix.json'

# Call the make_json function
make_json(csvFilePath, jsonFilePath)
The output that I get here is this:
{
    "1": {
        "file": "1",
        "skill": "SQL",
        "experience": "0.5",
        "Overall_exp": "3.0"
    }
}
How can I convert it to the former JSON format and not the latter?
You can use pandas to read your csv, group by File and export to json:
import numpy as np
import pandas as pd

df = pd.read_csv(your_csv)
df = df.groupby('File', as_index=False).agg({'skill': list, 'experience': list, 'overall_experience': np.mean})
print(df.to_json(orient='index', indent=4))
Note: you can specify the aggregation functions for your columns in a dictionary
Output:
{
    "0": {
        "File": 1,
        "skill": [
            "Java",
            "Python",
            "SQL"
        ],
        "experience": [
            1.5,
            1.0,
            0.5
        ],
        "overall_experience": 3.0
    }
}
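If you want the JSON keyed by the File value (as in your required output) rather than by the positional index, a small follow-up sketch:

# use the File column (as a string) as the index before dumping
df.index = df['File'].astype(str)
print(df.to_json(orient='index', indent=4))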
I think loading the data into pandas first and then narrowing it down is cleaner and easier. You can use the following code to parse your data into JSON files:
import pandas as pd
import json

# Load the CSV into Pandas
df = pd.read_csv('1.csv', header=0)
data = df.to_dict(orient='list')

# Delete / change as you wish
data['File'] = str(data['File'][0])
data['overall_experience'] = data['overall_experience'][0]

# Save as json
with open('1.json', 'w', encoding='utf-8') as jsonf:
    jsonf.write(json.dumps(data, indent=4))
Result (1.json)
{
    "File": "1",
    "skill": [
        "Java",
        "Python",
        "SQL"
    ],
    "experience": [
        1.5,
        1.0,
        0.5
    ],
    "overall_experience": 3
}
I suppose that you have multiple file IDs in one CSV file; your given example is quite minimal. In that case, you can create a master dictionary and add the smaller ones to it as follows:
import pandas as pd
import json

# Load the CSV into Pandas
df = pd.read_csv('1.csv', header=0)

# Master dictionary
master_dict = {}
for idx, file_id in enumerate(df["File"].unique()):
    data = df[df['File'] == file_id].to_dict(orient='list')
    # Delete / change as you wish
    data['File'] = str(data['File'][0])
    data['overall_experience'] = data['overall_experience'][0]
    master_dict[idx] = data

# Save as json
with open('1.json', 'w', encoding='utf-8') as jsonf:
    jsonf.write(json.dumps(master_dict, indent=4))
Result (1.json)
{
    "0": {
        "File": "1",
        "skill": [
            "Java",
            "Python",
            "SQL"
        ],
        "experience": [
            1.5,
            1.0,
            0.5
        ],
        "overall_experience": 3
    },
    "1": {
        "File": "2",
        "skill": [
            "Java",
            "Python"
        ],
        "experience": [
            2.0,
            2.5
        ],
        "overall_experience": 1
    }
}
If you don't want to use Pandas, you could try:
import csv
import json

def make_json(csvfile_path, jsonfile_path):
    data = {}
    with open(csvfile_path, "r") as csvfile:
        next(csvfile)  # Skip header line
        for row in csv.reader(csvfile):
            fdata = data.setdefault(row[0], {"file": row[0]})
            fdata.setdefault("skill", []).append(row[1])
            fdata.setdefault("experience", []).append(float(row[2]))
            fdata.setdefault("overall_experience", []).append(float(row[3]))
    with open(jsonfile_path, "w") as jsonfile:
        json.dump(data, jsonfile)
The main difference to your approach is the explicit structuring of the inner dicts: values are lists (except for the 'file' key). dict.setdefault() is great here: it sets a value for a key if the key isn't in the dict yet, and returns the value (either the newly set one or the existing one). So you can put a list in the dict, get it back, and immediately .append() to it.
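A minimal illustration of the setdefault pattern:

d = {}
d.setdefault("skill", []).append("Java")    # key missing: inserts [] first, returns it
d.setdefault("skill", []).append("Python")  # key present: returns the existing list
print(d)  # {'skill': ['Java', 'Python']}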
If you want to use a csv.DictReader:
def make_json(csvfile_path, jsonfile_path):
    data = {}
    with open(csvfile_path, "r") as csvfile:
        for row in csv.DictReader(csvfile):
            fdata = data.setdefault(row["file"], {"file": row["file"]})
            for key, value in list(row.items())[1:]:
                fdata.setdefault(key, []).append(
                    value if key == "skill" else float(value)
                )
    with open(jsonfile_path, "w") as jsonfile:
        json.dump(data, jsonfile)
(I didn't use it in the first version, since I wasn't sure about the actual column names.)
I am trying to scrape some JSON data. The first few rows are as follows, and all the rest is in the same format.
JSON data:
{
    "data": [
        {
            "date": "2011-10-07",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-08",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-12",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-13",
            "f(avg(output_total)/number(100000000))": 54.0515120216902
        },
        ...
    ]
}
I want to scrape each date together with its corresponding value (for the above, 2011-10-07 and 50, 2011-10-08 and 50, etc.) into a CSV file containing two columns (date and value).
How can I proceed? Is this possible with Python?
This is how I grabbed the JSON data:
import os
import requests
url='https://api.blockchair.com/litecoin/transactions?a=date,f(avg(output_total)/number(100000000))'
proxies = {}
response = requests.get(url=url, proxies=proxies)
print(response.content)
pandas allows you to solve this one in a few lines:
import pandas as pd

json_data = response.json()  # the response from your requests.get() call above
df = pd.DataFrame(json_data['data'])
df.columns = ["date", "value"]
df.to_csv("data.csv", index=False)
# named json_data to avoid shadowing Python's json module
json_data = {
    "data": [
        {
            "date": "2011-10-07",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-08",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-12",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-13",
            "f(avg(output_total)/number(100000000))": 54.0515120216902
        }
    ]
}
Step 1: Convert the JSON into a pandas DataFrame:
import pandas as pd

df = pd.DataFrame(json_data['data'])
Step 2: Filter the DataFrame based on conditions (e.g. value == 50):
df_filtered = df[df["f(avg(output_total)/number(100000000))"] == 50]
Step 3: Save the DataFrame to a CSV file, choosing where you would like to store it on your computer:
df_filtered.to_csv(r'C:\user\foo\output.csv', index=False)
If you wish to include the index, simply remove index=False.
You can do it like this: iterate over the JSON string, extract the data you need, and then write that data to a CSV file.
import json
import csv

fields = ['Date', 'Value']
filename = 'test.csv'

s = """
{
    "data": [
        {
            "date": "2011-10-07",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-08",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-12",
            "f(avg(output_total)/number(100000000))": 50
        },
        {
            "date": "2011-10-13",
            "f(avg(output_total)/number(100000000))": 54.0515120216902
        }
    ]
}
"""

x = json.loads(s)
with open(filename, 'w', newline='') as f:
    cw = csv.writer(f)
    cw.writerow(fields)
    for i in x['data']:
        cw.writerow(i.values())
test.csv:
Date,Value
2011-10-07,50
2011-10-08,50
2011-10-12,50
2011-10-13,54.0515120216902
If you just want a CSV file without relying on any additional Python modules (such as pandas) then it's very simple:
import requests

CSV = 'blockchair.csv'
url = 'https://api.blockchair.com/litecoin/transactions?a=date,f(avg(output_total)/number(100000000))'

with requests.Session() as session:
    response = session.get(url)
    response.raise_for_status()
    with open(CSV, 'w') as csv:
        csv.write('Date,Value\n')
        for d in response.json()['data']:
            for i, v in enumerate(d.values()):
                if i > 0:
                    csv.write(',')
                csv.write(str(v))
            csv.write('\n')
You can try this:
import requests
import csv
import pandas as pd

url = 'https://api.blockchair.com/litecoin/transactions?a=date,f(avg(output_total)/number(100000000))'
csv_name = 'res_values_1.csv'
response = requests.get(url=url).json()
res_data = response.get('data', [])

# Solution using pandas
res_df = pd.DataFrame(res_data)
res_df.rename(columns={'f(avg(output_total)/number(100000000))': 'value'}, inplace=True)
# filter data whose value is >= 50
filtered_res_df = res_df[res_df["value"] >= 50]
filtered_res_df.to_csv(csv_name, sep=',', encoding='utf-8', index=False)

# Solution using csv
csv_name = 'res_values_2.csv'
headers = ['date', 'value']
with open(csv_name, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for data in res_data:
        values = list(data.values())
        if values[1] >= 50:
            writer.writerow(values)
CSV Output:
date,value
2011-10-07,50.0
2011-10-08,50.0
2011-10-12,50.0
2011-10-13,54.0515120216902
.
.
.
2021-10-05,346.12752821011594
2021-10-06,293.5061907016782
2021-10-07,333.17665010641673
2021-10-08,332.2437737707938
I have written code to convert a CSV file to nested JSON format. I have multiple columns to nest, hence I assign them separately for each column. The problem is that I'm getting two fields for the same column in the JSON output.
import csv
import json
from collections import OrderedDict

csv_file = 'data.csv'
json_file = csv_file + '.json'

def main(input_file):
    csv_rows = []
    with open(input_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='|')
        for row in reader:
            row['TYPE'] = 'REVIEW',  # adding new key, value
            row['RAWID'] = 1,
            row['CUSTOMER'] = {
                "ID": row['CUSTOMER_ID'],
                "NAME": row['CUSTOMER_NAME']
            }
            row['CATEGORY'] = {
                "ID": row['CATEGORY_ID'],
                "NAME": row['CATEGORY']
            }
            del (row["CUSTOMER_NAME"], row["CATEGORY_ID"],
                 row["CATEGORY"], row["CUSTOMER_ID"])  # deleting since fields occurring twice
            csv_rows.append(row)
    with open(json_file, 'w') as f:
        json.dump(csv_rows, f, sort_keys=True, indent=4, ensure_ascii=False)
        f.write('\n')
The output is as below:
[
    {
        "CATEGORY": {
            "ID": "1",
            "NAME": "Consumers"
        },
        "CATEGORY_ID": "1",
        "CUSTOMER_ID": "41",
        "CUSTOMER": {
            "ID": "41",
            "NAME": "SA Port"
        },
        "CUSTOMER_NAME": "SA Port",
        "RAWID": [
            1
        ]
    }
]
I'm getting two entries for the fields I have assigned using row[''].
Is there any other way to get rid of this? I want only one entry for a particular field in each record.
Also, how can I convert the keys to lower case after reading them from csv.DictReader()? In my CSV file all the columns are in upper case, and hence I'm using the same case to assign, but I want to convert all of them to lower case.
In order to convert the keys to lower case, it is simpler to generate a new dict per row. As a bonus, this also gets rid of the duplicate fields:
for row in reader:
    orow = OrderedDict()
    orow['type'] = 'REVIEW'  # adding new key, value
    orow['rawid'] = 1
    orow['customer'] = {
        "id": row['CUSTOMER_ID'],
        "name": row['CUSTOMER_NAME']
    }
    orow['category'] = {
        "id": row['CATEGORY_ID'],
        "name": row['CATEGORY']
    }
    csv_rows.append(orow)
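If you'd rather not name every column, a sketch that lower-cases whatever keys the CSV happens to have (this skips the nesting step, which you'd still do as above):

for row in reader:
    # lower-case every key coming out of DictReader
    orow = {key.lower(): value for key, value in row.items()}
    orow['type'] = 'REVIEW'
    orow['rawid'] = 1
    csv_rows.append(orow)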
I have a CSV file that is structured as below:
Store, Region, District, MallName, Location
1234,90,910,MallA,GMT
4567,87,902,MallB,EST
2468,90,811,MallC,PST
1357,87,902,MallD,CST
What I was able to accomplish with my iterative brow-beating was getting a format like so:
{
    "90": {
        "910": {
            "1234": {
                "name": "MallA",
                "location": "GMT"
            }
        },
        "811": {
            "2468": {
                "name": "MallC",
                "location": "PST"
            }
        }
    },
    "87": {
        "902": {
            "4567": {
                "name": "MallB",
                "location": "EST"
            },
            "1357": {
                "name": "MallD",
                "location": "CST"
            }
        }
    }
}
The code below is stripped down to match the sample data set I provided, but you get the idea of what is happening. Again, it's very iterative and non-Pythonic, which I'm also trying to move away from. (If anyone feels the helper procedures would be worthwhile to post, I can.)
#************
# Main()
#************
dictHierarchy = {}
with open(getFilePath(), 'r') as f:
    content = [line.strip('\n') for line in f.readlines()]
    for data in content:
        data = data.split(",")
        myRegion = data[1]
        myDistrict = data[2]
        myName = data[3]
        myLocation = data[4]
        myStore = data[0]
        if myRegion in dictHierarchy:
            # check for District
            if myDistrict in dictHierarchy[myRegion]:
                # check for Store
                dictHierarchy[myRegion][myDistrict].update({myStore: addStoreDetails(data)})
            else:
                # add district
                dictHierarchy[myRegion].update({myDistrict: addStore(data)})
        else:
            # add region
            dictHierarchy.update({myRegion: addDistrict(data)})

with open('hierarchy.json', 'w') as outfile:
    json.dump(dictHierarchy, outfile)
Obsessive compulsive me looked at the JSON output above and thought that to someone blindly opening the file it looks like a hodge-podge. What I was hoping to do for plain-text readability is group the data and throw it into JSON format as so:
{"Regions":[
{"Region":"90", "Districts":[
{"District":"910", "Stores":[
{"Store":"1234", "name":"MallA", "location":"GMT"}]},
{"District":"811", "Stores":[
{"Store":"2468", "name":"MallC", "location":"PST"}]}]},
{"Region":"87", "Districts":[
{"District":"902", "Stores":[
{"Store":"4567", "name":"MallB", "location":"EST"},
{"Store":"1357", "name":"MallD", "location":"CST"}]}]}]}
Long story short I wasted quite some time today trying to sort out how to actually populate the data structure in Python and essentially ended up no where. Is there a clean, pythonic way to achieve this? Is it even worth the effort?
I've added headers to your input like:
Store,Region,District,name,location
1234,90,910,MallA,GMT
4567,87,902,MallB,EST
2468,90,811,MallC,PST
1357,87,902,MallD,CST
then used Python's csv reader and itertools.groupby like this:
import csv
from itertools import groupby
from operator import itemgetter

with open('in.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    regions = []
    regions_dict = sorted(list(reader), key=itemgetter('Region'))
    for region_id, region_group in groupby(regions_dict, itemgetter('Region')):
        districts = []
        regions.append({'Region': region_id, 'Districts': districts})
        districts_dict = sorted(region_group, key=itemgetter('District'))
        for district_id, district_group in groupby(districts_dict, itemgetter('District')):
            districts.append({'District': district_id, 'Stores': list(district_group)})
    print(regions)
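To get exactly the shape you showed, wrap the list under a "Regions" key and dump it to a file:

import json

with open('hierarchy.json', 'w') as outfile:
    json.dump({'Regions': regions}, outfile, indent=2)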