multiple json to csv using pandas python - python

trying to convert multiple json files to 1 csv file
tried 2 ways,
first one using pandas ,
second using json and csv writer
about my json
keys are unordered and some keys are different in every file
code using writer
file_list=os.listdir('output')
count = 0
for file in file_list:
dict={}
file_path = "output/" + file
with open(file_path,'r') as f:
jsonData=json.load(f)
datafile=open('data.csv','a')
csv_writer = csv.writer(datafile)
if count == 0:
header = jsonData.keys()
csv_writer.writerow(header)
count += 1
csv_writer.writerow(jsonData.values())
if count == 1:
csv_writer.writerow(jsonData.values())
datafile.close()
problem
bcoz my data is unordered and different keys so in my csv file wrong value is coming under wrong header
code using pandas
for file in file_list:
dict={}
file_path = "output/" + file
with open(file_path,'r') as f:
jsonData=json.load(f)
for j in jsonData:
dict.update({j:[jsonData[j]]})
df=pd.DataFrame(dict)
df.to_csv("hello.csv")
problem
i dont know how to append in pandas
so this is showing only 2 rows bcoz of my last json file i guess
inside my json

Try this code:
import pandas as pd
import json
import pathlib
data_path = pathlib.Path('.')
keys = ['Solutions', 'account_number', 'actual_reading_current','actual_reading_previous', 'address', 'amount_due']
dat = dict([(k, []) for k in keys])
for jfile in data_path.glob('*.json'):
with jfile.open('r') as ifile:
json_data = json.load(ifile)
for key in keys:
dat[key].append(json_data[key][0] if key in json_data else None)
result = pd.DataFrame.from_dict(dat)
result.to_csv('result.csv')
I first define a dictionary containing the columns that I want.
Then I read in the json files and append them as rows to the dictionary.
Note, that I had to edit your json files, one was missing a ending quote and I had to replace the single quotes by double quotes.

Related

How to convert multiple json files to cvs files

Hello I have multiple json files in a path and I want to convert all of them to csv files separately. Here is what I have tried so far which just convert one json file to a csv file.
with open('/Users/hh/MyDataSet/traceJSON-663-661-A0-25449-7.json') as f:
for line in f:
data.append(json.loads(line))
csv_file=open('/Users/hh/MyDataSet/GTruth/traceJSON-663-661-A0-25449-7.csv','w')
write=csv.writer(csv_file)
# write.writerow(["row number","type","rcvTime","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
# ,"hed_x","hed_y","hed_z"])
write.writerow(["row number","type","rcvTime","sender","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
,"hed_x","hed_y","hed_z"])
for elem in range(len(data)):
if data[elem]['type']==2:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),'663',round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
elif data[elem]['type']==3:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),round(data[elem]['sender'],2),round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
# json_file.close()
print('done!')
csv_file.close()
I appreciate if anyone can help me how can I do it. Also in each json file name "traceJSON-663-661-A0-25449-7", the first number like in the above code (663) should be written in csv file like the following code,if the type is 2:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),'663',....
My json file names are like traceJSON-51-49-A16-25217-7, traceJSON-57-55-A0-25223-7, ....
I suggest using pandas for this:
from glob import glob
import pandas as pd
import os
filepaths = glob('/Users/hh/MyDataSet/*.json') # get list of json files in folder
for f in filepaths:
filename = os.path.basename(f).rsplit('.', 1)[0] # extract filename without extension
nr = int(filename.split('-')[1]) # extract the number from the filename - assuming that all filenames are formatted similarly, use regex otherwise
df = pd.read_json(f) # read the json file as a pandas dataframe, assuming the json file isn't nested
df['type'] = df['type'].replace(2, nr) # replace 2 in 'type' column with the number in the filename
df.to_csv(f'{filename}.csv') # save as csv
If you want to round columns, you can also do this with pandas
import csv
import glob
import json
import os.path
for src_path in glob.glob('/Users/hh/MyDataSet/*.json'):
src_name = os.path.splitext(os.path.basename(src_path))[0]
data = []
with open(src_path) as f:
for line in f:
data.append(json.loads(line))
dest_path = '/Users/hh/MyDataSet/GTruth/' + src_name + '.csv'
csv_file=open(dest_path,'w')
write=csv.writer(csv_file)
write.writerow(["row number","type","rcvTime","sender","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
,"hed_x","hed_y","hed_z"])
for elem in range(len(data)):
if data[elem]['type']==2:
sender = src_name.split('-')[1]
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),sender,round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
elif data[elem]['type']==3:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),round(data[elem]['sender'],2),round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
csv_file.close()
print('done!')

Split values in CSV that look like JSON

So I have a CSV file with a column called content. However, the contents in column look like it is based on JSON, and, therefore, house more columns. I would like to split these contents into multiple columns or extract the final part of it after "value". See picture below to see an example of the file. Any ideas how to get this? I would prefer using Python. I don't have any experience with JSON.
Using pandas you could do in a simpler way.
EDIT updated to handle the single quotes:
import pandas as pd
import json
data = pd.read_csv('test.csv', delimiter="\n")["content"]
res = [json.loads(row.replace("'", '"')) for row in data]
result = pd.DataFrame(res)
result.head()
# Export result to CSV
result.to_csv("result.csv")
my csv:
result:
This script will create a new csv file with the 'value' added to the csv as an additional column
(make sure that the input_csv and output_csv are different filenames)
import csv
import json
input_csv = "data.csv"
output_csv = "data_updated.csv"
values = []
with open(input_csv) as f_in:
dr = csv.DictReader(f_in)
for row in dr:
value = json.loads(row["content"].replace("'", '"'))["value"]
values.append(value)
with open(input_csv) as f_in:
with open(output_csv, "w+") as f_out:
w = csv.writer(f_out, lineterminator="\n")
r = csv.reader(f_in)
all = []
row = next(r)
row.append("value")
all.append(row)
i = 0
for row in r:
row.append(values[i])
all.append(row)
i += 1
w.writerows(all)

Generate a json file

I need to generate a .json file which has data in the following format:
{"cnt":[1950,1600,400,1250,995],
"dt":["2020-01","2020-02","2020-03","2020-04","2020-05"]}
I would prefer it getting generated by querying a table or using a CSV to JSON conversion. The format data I will have after querying or in my CSV file will be:
How to do this?
import csv
import json
with open('csv_file_path') as f:
dict_reader = csv.DictReader(f)
dicts = [dict(i) for i in dict_reader]
field_names = dict_reader.fieldnames #get the column headings # CNT,DT etc..
output_dict = {}
for item in field_names:
output_dict.setdefault(item,[])
for d in dicts:
for key in d:
output_dict[key].append(d[key])
with open('josn_file_path', 'w+') as f:
f.write(json.dumps(output_dict, indent=4))
Tested and works fine.
This example will turn your row-based data into a dict-based version.
Please keep in mind that I didn't test this - but it should work fine.
In essence this is what's happening:
Read the source data
Determine the headings you need for the dict
Fill the new data-format from your source_data
Dump this new format to a json file.
Code:
import csv
import json
# Read the source data
with open('path_to_csv_file') as f:
source_data = [i for i in csv.DictReader(f)]
# Discover headers, and prep dict framework.
target_data = {key: [] for key in source_data[0].keys()}
# Iterate over the source_data and append the values to the righ key in target_data
for row in source_data:
for k, v in row.items():
target_data[k].append(v)
# Write target data to json file
with open('path_to_json_file', 'w') as f:
json.dump(data, f)

Attempting to read_csv over a list generates a b literal I can't get rid of

I have a list of countries with corresponding .csv files. When I attempt to read_csv iterated over the list with a for loop, I get an error.
I tried generating an empty dict first and making a dict of dataframes, I tried using decode, I tried using item = r'{}.csv'.format(file) instead of just item = '{}.csv'.format(file).
import pandas as pd
import string as str
fileslist = []
with open('data/files.txt') as f:
for line in f:
fileslist.append(f.readline().strip())
for file in fileslist:
item = '{}.csv'.format(file)
print(item)
item = pd.read_csv(item)
This should give me a number of dataframes starting with a dataframe named algeria. Instead I get the error "FileNotFoundError: File b'algeria.csv' does not exist".
This code may help you
import os
import pandas as pd
fileslist = []
with open("data/files.txt", mode='r', encoding="utf-8") as fp:
for line in fp.readlines():
fileslist.append(line.strip())
for file in fileslist:
# make sure your files are in same directory
# if they are in data folder then don't forget to add 'data/{}.csv'.format(file)
item = '{}.csv'.format(file)
if os.path.isfile(item):
item = pd.read_csv(item)

Parsing .DAT file with Python

I need to convert a .dat file that's in a specific format into a .csv file.
The .dat file has multiple rows with a repeating structure. The data is held in brackets and have tags. Below is the sample data; it repeats throughout the data file:
{"name":"ABSDSDSRF","ID":"AFJDKGFGHF","lat":37,"lng":-122,"type":0,"HAC":5,"verticalAccuracy":4,"course":266.8359375,"area":"san_francisco"}
Can anyone provide a starting point for the script?
This will create a csv assuming each line in your .DAT is json. Just order the header list to your liking
import csv, json
header = ['ID', 'name', 'type', 'area', 'HAC', 'verticalAccuracy', 'course', 'lat', 'lng']
with open('file.DAT') as datfile:
with open('output.csv', 'wb') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=header)
writer.writeheader()
for line in datfile:
writer.writerow(json.loads(line))
Your row is in json format. So, you can use:
import json
data = json.loads('{"name":"ABSDSDSRF","ID":"AFJDKGFGHF","lat":37,"lng":-122,"type":0,"HAC":5,"verticalAccuracy":4,"course":266.8359375,"area":"san_francisco"}')
print data.get('name')
print data.get('ID')
This is only a start point. You have to iter all the .dat file. At the end, you have to write an exporter to save the data into the csv file.
Use a regex to find all of the data items. Use ast.literal_eval to convert each data item into a dictionary. Collect the items in a list.
import re, ast
result = []
s = '''{"name":"ABSDSDSRF","ID":"AFJDKGFGHF","lat":37,"lng":-122,"type":0,"HAC":5,"verticalAccuracy":4,"course":266.8359375,"area":"san_francisco"}'''
item = re.compile(r'{[^}]*?}')
for match in item.finditer(s):
d = ast.literal_eval(match.group())
result.append(d)
If each data item is on a separate line in the file You don't need the regex - you can just iterate over the file.
with open('file.dat') as f:
for line in f:
line = line.strip()
line = ast.literal_eval(line)
result.append(line)
Use json.load:
import json
with open (filename) as fh:
data = json.load (fh)
...

Categories

Resources