Converting a large CSV file to multiple JSON files using Python

Converting a large CSV file to multiple JSON files using Python - python

I am currently using the following code to convert a large CSV file to a JSON file.
import csv
import json
def csv_to_json(csvFilePath, jsonFilePath):
jsonArray = []
with open(csvFilePath, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for row in csvReader:
jsonArray.append(row)
with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
jsonString = json.dumps(jsonArray, indent=4)
jsonf.write(jsonString)
csvFilePath = r'test_data.csv'
jsonFilePath = r'test_data.json'
csv_to_json(csvFilePath, jsonFilePath)
This code works fine and I am able to convert the CSV to JSON without any issues. However, as the CSV file contains 600,000+ rows and hence as many items in my JSON, it has become very difficult to manage the JSON file.
I would like to modify my above code such that for every 5000 rows of the CSV, the data is written into a new JSON file. Ideally, I would be having 120 (600,000/5000) JSON files in this case.
How can I do the same?

Split up your read\write methods and add a simple threshold:
JSON_ENTRIES_THRESHOLD = 5000 # modify to whatever you see suitable
def write_json(json_array, filename):
with open(filename, 'w', encoding='utf-8') as jsonf:
json.dump(json_array, jsonf) # note the usage of .dump directly to a file descriptor
def csv_to_json(csvFilePath, jsonFilePath):
jsonArray = []
with open(csvFilePath, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
filename_index = 0
for row in csvReader:
jsonArray.append(row)
if len(jsonArray) >= JSON_ENTRIES_THRESHOLD:
# if we reached the treshold, write out
write_json(jsonArray, f"jsonFilePath-{filename_index}.json")
filename_index += 1
jsonArray = []
# Finally, write out the remainder
write_json(jsonArray, f"jsonFilePath-{filename_index}.json")

Related

CSV to JSON Mass converter in Python

I have a folder it is called DATA, inside that folder there is multiple .logs files and it is formatted as CSV . Now I want to convert every single .logs files inside DATA folder using Python.
import csv
import json
import glob, os
def csv_to_json(csvFilePath, jsonFilePath):
jsonArray = []
#read csv file
with open(csvFilePath, encoding='utf-8') as csvf:
#load csv file data using csv library's dictionary reader
csvReader = csv.DictReader(csvf)
#convert each csv row into python dict
for row in csvReader:
#add this python dict to json array
jsonArray.append(row)
#convert python jsonArray to JSON String and write to file
with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
jsonString = json.dumps(jsonArray, indent=4)
jsonf.write(jsonString)
os.chdir(r"C:\Users\Arda\Desktop\DATA")# use whatever directory you want
#double\\ no single \
for file in glob.glob("**/*.logs", recursive = True):
csvFilePath = []
csvFilePath = file
jsonFilePath = r'data.json'
csv_to_json(csvFilePath, jsonFilePath)
I can only convert one single file but there is multiple .logs file as CSV
In that list last one has been converted to JSON "T1555.logs"
T1003.001.logs
T1003.002.logs
T1003.003.logs
T1003.004.logs
T1003.logs
T1552.002.logs
T1552.004.logs
T1555.003.logs
T1555.logs

I'd re-arrange where you're traversing the files so that all of the results are stored in a single jsonArray, then written to the file at the end:
import csv
import json
import glob, os
def csvs_to_json(csvFilePaths, jsonFilePath):
jsonArray = []
for csvFilePath in csvFilePaths:
#read csv file
with open(csvFilePath, encoding='utf-8') as csvf:
#load csv file data using csv library's dictionary reader
csvReader = csv.DictReader(csvf)
#convert each csv row into python dict
for row in csvReader:
#add this python dict to json array
jsonArray.append(row)
#convert python jsonArray to JSON String and write to file
with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
jsonString = json.dumps(jsonArray, indent=4)
jsonf.write(jsonString)
os.chdir(r"C:\Users\Arda\Desktop\DATA")# use whatever directory you want
#double\\ no single \
files = list(glob.glob("**/*.logs", recursive = True))
jsonFilePath = r'data.json'
csvs_to_json(files, jsonFilePath)
Let me know if this works for you!

csv header in python only on the top row?

i have written a python program which makes an api call to a webserver once every minute and then parse the json response and saves parsed values in to the csv files.
here is the code that is saving the values into the csv file :
with open('data.csv', 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow([current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
how can i make it so that it saves the header only once on the top most row and not on every row ?
UPDATE:
here is a bit of more code to make api call and write the data to file :
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import requests
import json
import csv
from datetime import datetime
def fn():
print("Starting...")
session_id = "auZsJ4F2RsQNJxSPTMDt2238324"
Outlook='http://www.myfxbook.com/api/get-community-outlook.json?session=' + session_id
Outlook_response = requests.get(Outlook)
Outlook_data = Outlook_response.json()['symbols']
now = datetime.now()
current_time = now.strftime("%H:%M")
EURUSD=Outlook_data[0]
SHORTPERC=EURUSD['shortPercentage']
LONGPERC =EURUSD['longPercentage']
SHORTvolume=EURUSD['shortVolume']
longVolume=EURUSD['longVolume']
longPositions=EURUSD['longPositions']
shortPositions=EURUSD['shortPositions']
with open('data.csv', 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow([current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
with open('data1.csv', 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow([SHORTvolume, longVolume])
with open('data2.csv', 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow([SHORTPERC, LONGPERC])
i cant post the full code cuz it will be very ugly since its around 700 lines long , but the above mentioned code should work to create the csv file
this is how one of my csv files look :
07:11,31,69,555.55,1265.14,4750,2607
07:12,31,69,555.55,1265.16,4751,2607
07:13,31,69,555.55,1265.16,4751,2607
07:14,30,70,555.56,1267.36,4752,2608
07:15,30,70,555.56,1267.36,4752,2608
07:16,30,70,555.56,1267.36,4752,2608
07:17,30,70,555.46,1267.36,4752,2607
07:18,31,69,558.61,1267.36,4752,2610
07:19,31,69,558.61,1267.37,4753,2610
07:20,31,69,561.58,1267.37,4753,2611
07:21,31,69,561.61,1267.37,4753,2613
07:22,31,69,561.65,1267.37,4753,2614
07:23,31,69,561.65,1267.36,4752,2614
this is just part of the csv file , more rows keep adding as time passes
EDIT 2:
answer suggested by Sparkofska seems to work but somehow it ends up giving an empty row in between every line like this:
Time,ShortPer,LongPer,ShortVolume,LongVolume,ShortPosition,LongPosition
05:47,44,56,19528.8,24789.27,65223,48630
05:48,44,56,19529.04,24789.27,65223,48633
code :
EURUSD=Outlook_data[0]
SHORTPERC=EURUSD['shortPercentage']
LONGPERC =EURUSD['longPercentage']
SHORTvolume=EURUSD['shortVolume']
longVolume=EURUSD['longVolume']
longPositions=EURUSD['longPositions']
shortPositions=EURUSD['shortPositions']
filename='EURUSD.csv';
def write_row_header_aware(filename, row):
if not os.path.exists(filename) or os.stat(filename).st_size == 0:
with open(filename, 'a') as file:
writer = csv.writer(file)
writer.writerow(['Time', 'ShortPer', 'LongPer','ShortVolume','LongVolume','ShortPosition','LongPosition'])
with open(filename, 'a') as file:
writer = csv.writer(file)
writer.writerow([current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
write_row_header_aware(filename, [current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
print("done...")

You could wrap the writerow function to have it automatically add the header if needed.
If your output csv file is not empty, we can assert the header was already written and simply append the row. Otherwise (file not exist or empty) we write the header before appending the row.
import os
def write_row_header_aware(filename, row):
# in case file doesn't exist or is empty
if not os.path.exists(filename) or os.stat(filename).st_size == 0:
# write header
with open(filename, 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow(['current_time', 'SHORTPERC', 'LONGPERC', ...])
# write line as usual
with open(filename, 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow(row)
write_row_header_aware('data.csv', [current_time, SHORTPERC, LONGPERC, ...])

Please make a check to know if the file exists, if it already exists use append to write rows to file else write the headers. By this way you could avoid writing headers multiple times. Please refer to this link

Converting JSON to CSV, CSV is empty

I'm attempting to convert yelps data set that is in JSON to a csv format. The new csv file that is created is empty.
I've tried different ways to iterate through the JSON but they all give me a zero bytes file.
The json file looks like this:
{"business_id":"1SWheh84yJXfytovILXOAQ","name":"Arizona Biltmore Golf Club","address":"2818 E Camino Acequia Drive","city":"Phoenix","state":"AZ","postal_code":"85016","latitude":33.5221425,"longitude":-112.0184807,"stars":3.0,"review_count":5,"is_open":0,"attributes":{"GoodForKids":"False"},"categories":"Golf, Active Life","hours":null}
import json
import csv
infile = open("business.json","r")
outfile = open("business2.csv","w")
data = json.load(infile)
infile.close()
out = csv.writer(outfile)
out.writerow(data[0].keys())
for row in data:
out.writerow(row.values())
I get an "extra data" message when the code runs. The new business2 csv file is empty and the size is zero bytes.

if you JSON has only one row.. then try this
infile = open("business.json","r")
outfile = open("business2.csv","w")
data = json.load(infile)
infile.close()
out = csv.writer(outfile)
#print(data.keys())
out.writerow(data.keys())
out.writerow(data.values())

Hi Please try the below code, by using with command the file access will automatically get closed when the control moves out of scope of with
infile = open("business.json","r")
outfile = open("business2.csv","w")
data = json.load(infile)
infile.close()
headers = list(data.keys())
values = list(data.values())
with open("business2.csv","w") as outfile:
out = csv.writer(outfile)
out.writerow(headers)
out.writerow(values)

You need to use with to close file.
import json
import csv
infile = open("business.json","r")
data = json.load(infile)
infile.close()
with open("business2.csv","w") as outfile:
out = csv.writer(outfile)
out.writerow(list(data.keys()))
out.writerow(list(data.values()))

JSON like data to CSV file in python - not showing headers correctly

I am transforming JSON like data to CSV and having a few issues.
The code is here:
import json
import csv
def parse_file(inputed_file):
with open(input_file, 'r') as inputed_file:
content = inputed_file.readlines()
split_file = open('test.csv', 'w')
for line in content:
lines = line.split('\t')
data = json.loads(lines[0])
writer = csv.DictWriter(split_file, fieldnames = ["title", "firstname"], delimiter = ',')
writer.writeheader()
The problem is this is adding a header on each row for the data, I want to only have the header displayed once. Then add this for the data to go below the headers:
writer.writerow(data)
I have looked at this and tried it but failed: How can I convert JSON to CSV?.

Create the DictWriter outside the loop, and just call writer.writeheader() there. Then call writer.writerow() inside the loop.
def parse_file(inputed_file):
with open(input_file, 'r') as inputed_file:
content = inputed_file.readlines()
split_file = open('test.csv', 'w')
writer = csv.DictWriter(split_file, fieldnames = ["title", "firstname"], delimiter = ',')
writer.writeheader()
for line in content:
lines = line.split('\t')
data = json.loads(lines[0])
writer.writerow(data)

How to convert CSV file to multiline JSON?

Here's my code, really simple stuff...
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
out = json.dumps( [ row for row in reader ] )
jsonfile.write(out)
Declare some field names, the reader uses CSV to read the file, and the filed names to dump the file to a JSON format. Here's the problem...
Each record in the CSV file is on a different row. I want the JSON output to be the same way. The problem is it dumps it all on one giant, long line.
I've tried using something like for line in csvfile: and then running my code below that with reader = csv.DictReader( line, fieldnames) which loops through each line, but it does the entire file on one line, then loops through the entire file on another line... continues until it runs out of lines.
Any suggestions for correcting this?
Edit: To clarify, currently I have: (every record on line 1)
[{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"},{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}]
What I'm looking for: (2 records on 2 lines)
{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"}
{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}
Not each individual field indented/on a separate line, but each record on it's own line.
Some sample input.
"John","Doe","001","Message1"
"George","Washington","002","Message2"

The problem with your desired output is that it is not valid json document,; it's a stream of json documents!
That's okay, if its what you need, but that means that for each document you want in your output, you'll have to call json.dumps.
Since the newline you want separating your documents is not contained in those documents, you're on the hook for supplying it yourself. So we just need to pull the loop out of the call to json.dump and interpose newlines for each document written.
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
for row in reader:
json.dump(row, jsonfile)
jsonfile.write('\n')

You can use Pandas DataFrame to achieve this, with the following Example:
import pandas as pd
csv_file = pd.DataFrame(pd.read_csv("path/to/file.csv", sep = ",", header = 0, index_col = False))
csv_file.to_json("/path/to/new/file.json", orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)

import csv
import json
file = 'csv_file_name.csv'
json_file = 'output_file_name.json'
#Read CSV File
def read_CSV(file, json_file):
csv_rows = []
with open(file) as csvfile:
reader = csv.DictReader(csvfile)
field = reader.fieldnames
for row in reader:
csv_rows.extend([{field[i]:row[field[i]] for i in range(len(field))}])
convert_write_json(csv_rows, json_file)
#Convert csv data into json
def convert_write_json(data, json_file):
with open(json_file, "w") as f:
f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': '))) #for pretty
f.write(json.dumps(data))
read_CSV(file,json_file)
Documentation of json.dumps()

I took #SingleNegationElimination's response and simplified it into a three-liner that can be used in a pipeline:
import csv
import json
import sys
for row in csv.DictReader(sys.stdin):
json.dump(row, sys.stdout)
sys.stdout.write('\n')

You can try this
import csvmapper
# how does the object look
mapper = csvmapper.DictMapper([
[
{ 'name' : 'FirstName'},
{ 'name' : 'LastName' },
{ 'name' : 'IDNumber', 'type':'int' },
{ 'name' : 'Messages' }
]
])
# parser instance
parser = csvmapper.CSVParser('sample.csv', mapper)
# conversion service
converter = csvmapper.JSONConverter(parser)
print converter.doConvert(pretty=True)
Edit:
Simpler approach
import csvmapper
fields = ('FirstName', 'LastName', 'IDNumber', 'Messages')
parser = CSVParser('sample.csv', csvmapper.FieldMapper(fields))
converter = csvmapper.JSONConverter(parser)
print converter.doConvert(pretty=True)

I see this is old but I needed the code from SingleNegationElimination however I had issue with the data containing non utf-8 characters. These appeared in fields I was not overly concerned with so I chose to ignore them. However that took some effort. I am new to python so with some trial and error I got it to work. The code is a copy of SingleNegationElimination with the extra handling of utf-8. I tried to do it with https://docs.python.org/2.7/library/csv.html but in the end gave up. The below code worked.
import csv, json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("Scope","Comment","OOS Code","In RMF","Code","Status","Name","Sub Code","CAT","LOB","Description","Owner","Manager","Platform Owner")
reader = csv.DictReader(csvfile , fieldnames)
code = ''
for row in reader:
try:
print('+' + row['Code'])
for key in row:
row[key] = row[key].decode('utf-8', 'ignore').encode('utf-8')
json.dump(row, jsonfile)
jsonfile.write('\n')
except:
print('-' + row['Code'])
raise

Add the indent parameter to json.dumps
data = {'this': ['has', 'some', 'things'],
'in': {'it': 'with', 'some': 'more'}}
print(json.dumps(data, indent=4))
Also note that, you can simply use json.dump with the open jsonfile:
json.dump(data, jsonfile)

Use pandas and the json library:
import pandas as pd
import json
filepath = "inputfile.csv"
output_path = "outputfile.json"
df = pd.read_csv(filepath)
# Create a multiline json
json_list = json.loads(df.to_json(orient = "records"))
with open(output_path, 'w') as f:
for item in json_list:
f.write("%s\n" % item)

How about using Pandas to read the csv file into a DataFrame (pd.read_csv), then manipulating the columns if you want (dropping them or updating values) and finally converting the DataFrame back to JSON (pd.DataFrame.to_json).
Note: I haven't checked how efficient this will be but this is definitely one of the easiest ways to manipulate and convert a large csv to json.

As slight improvement to #MONTYHS answer, iterating through a tup of fieldnames:
import csv
import json
csvfilename = 'filename.csv'
jsonfilename = csvfilename.split('.')[0] + '.json'
csvfile = open(csvfilename, 'r')
jsonfile = open(jsonfilename, 'w')
reader = csv.DictReader(csvfile)
fieldnames = ('FirstName', 'LastName', 'IDNumber', 'Message')
output = []
for each in reader:
row = {}
for field in fieldnames:
row[field] = each[field]
output.append(row)
json.dump(output, jsonfile, indent=2, sort_keys=True)

def read():
noOfElem = 200 # no of data you want to import
csv_file_name = "hashtag_donaldtrump.csv" # csv file name
json_file_name = "hashtag_donaldtrump.json" # json file name
with open(csv_file_name, mode='r') as csv_file:
csv_reader = csv.DictReader(csv_file)
with open(json_file_name, 'w') as json_file:
i = 0
json_file.write("[")
for row in csv_reader:
i = i + 1
if i == noOfElem:
json_file.write("]")
return
json_file.write(json.dumps(row))
if i != noOfElem - 1:
json_file.write(",")
Change the above three parameter, everything will be done.

import csv
import json
csvfile = csv.DictReader('filename.csv', 'r'))
output =[]
for each in csvfile:
row ={}
row['FirstName'] = each['FirstName']
row['LastName'] = each['LastName']
row['IDNumber'] = each ['IDNumber']
row['Message'] = each['Message']
output.append(row)
json.dump(output,open('filename.json','w'),indent=4,sort_keys=False)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Converting a large CSV file to multiple JSON files using Python - python

Related

CSV to JSON Mass converter in Python

csv header in python only on the top row?

Converting JSON to CSV, CSV is empty

JSON like data to CSV file in python - not showing headers correctly

How to convert CSV file to multiline JSON?

Categories

Resources