I want to write dicts as gzipped JSON objects into a JSON file.
I had some solutions, but as the file got bigger the appending process got slower and slower, so loading the whole file on every append was not the way to go.
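For context, the slow variant I mean is the usual load-append-dump pattern; roughly a sketch of what I was doing before (the function name is just illustrative):

import json

# Load-everything-then-rewrite: gets slower as the file grows, because every
# append re-reads and re-writes the whole file.
def append_record_slow(data, filename):
    with open(filename) as file:
        records = json.load(file)
    records.append(data)
    with open(filename, 'w') as file:
        json.dump(records, file)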
I found the solution here with:
def append_record_seek(data, filename):
    print(f'append_record_seek started with data:{data} filename:{filename}')
    with open(filename, mode="r+") as file:
        file.seek(os.stat(filename).st_size - 1)
        file.write(",]".format(json.dumps(data)))
Later I want to read that file back as a list of dicts.
Here is my minimal code example:
import global_variables as gv
import time
import json as json
import base64
import io
import sys
import cv2
import gzip
import numpy as np
import os
from numpy import asarray
from json import JSONEncoder
data = {
    "brand": "Ford",
    "model": "Mustang",
    "year": 1964
}
path = r'C:/Videotool/Data'
name = 'test'
filename = path + '/' + name + '.json'
isExist = os.path.exists(path)
if not isExist:
    os.makedirs(path)
os.chdir(path)
def first_writer(data, filename):
    print(f'first_writer started with data:{data} filename:{filename}')
    with open(filename, 'w') as file:
        file.write('[')
        file.write(json.dumps(data))
        file.write(',')
        file.write(']')
def append_record_seek(data, filename):
    print(f'append_record_seek started with data:{data} filename:{filename}')
    with open(filename, mode="r+") as file:
        file.seek(os.stat(filename).st_size - 1)
        file.write(",]".format(json.dumps(data)))
for x in range(6):
    print(f'step:{x}')
    file_exists = os.path.exists(name + '.json')
    if file_exists:
        print('file_exists')
        append_record_seek(data, filename)
    else:
        print('else')
        first_writer(data, filename)
The non-zipped result should look like this:
[{"brand": "Ford", "model": "Mustang", "year": 1964},
{"brand": "Ford", "model": "Mustang", "year": 1964},
{"brand": "Ford", "model": "Mustang", "year": 1964},
{"brand": "Ford", "model": "Mustang", "year": 1964},
{"brand": "Ford", "model": "Mustang", "year": 1964}]
My result is: [{"brand": "Ford", "model": "Mustang", "year": 1964},,,,,,] because ",]".format(json.dumps(data)) has no {} placeholder, so the dumped record never gets written.
If that works, I want to zip the dumps before writing.
I hope somebody can help.
Update:
I've got the right JSON format with:
def first_writer(data, filename):
    print(f'first_writer started with data:{data} filename:{filename}')
    with open(filename, 'w') as file:
        file.write("[{}]".format(json.dumps(data)))

def append_record_seek(data, filename):
    print(f'append_record_seek started with data:{data} filename:{filename}')
    with open(filename, mode="r+") as file:
        file.seek(os.stat(filename).st_size - 1)
        file.write(",{}]".format(json.dumps(data)))
Now I have to get that zipped...
NOTE: This is not the answer to the question, as there is none; this will just highlight that a single compressed file can be generated and decompressed later, but it will not be valid JSON.
import gzip
from copy import copy
import json
# just test data
x = {
    "brand": "Ford",
    "model": "Mustang",
    "year": 1964
}
z = {
    "brand": "Mato",
    "model": "Laatikko",
    "year": 2023
}
l = []
# populate the initial "json" in the list l
for i in range(3):
    y = copy(x)
    y["year"] += i
    l.append(y)

# write the list of dicts as a JSON string into a file and compress it via gzip
# it doesn't really matter how this was originally done..
with open("data.gz", "wb") as f:
    f.write(gzip.compress(bytes(json.dumps(l, indent=2), "utf-8")))

# then, append a new entry to the same file -- which will get uncompressed
# along with the previously stored *valid* json structure..
with open("data.gz", "ab") as f:
    f.write(gzip.compress(bytes(json.dumps(z, indent=2), "utf-8")))
This will result in a file that looks like this when uncompressed:
[
  {
    "brand": "Ford",
    "model": "Mustang",
    "year": 1964
  },
  {
    "brand": "Ford",
    "model": "Mustang",
    "year": 1965
  },
  {
    "brand": "Ford",
    "model": "Mustang",
    "year": 1966
  }
]{
  "brand": "Mato",
  "model": "Laatikko",
  "year": 2023
}
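For reference, a minimal sketch of reading such a multi-member file back: gzip.open transparently concatenates the members, which is exactly why the decompressed text above is not one valid JSON document.

import gzip

# gzip.open reads through all appended members, so the result is the
# concatenation shown above -- readable, but not parseable as one JSON value.
with gzip.open("data.gz", "rb") as f:
    text = f.read().decode("utf-8")
print(text)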
Related
I'm trying to update an existing JSON file when running my code by adding additional data (package_id). This is the existing JSON contents:
{
  "1": {
    "age": 10,
    "name": [
      "ramsi",
      "jack",
      "adem",
      "sara"
    ],
    "skills": []
  }
}
and I want to insert a new package so it looks like this:
{"1": {
"age": 10,
"name": [
"ramsi",
"jack",
"adem",
"sara",
],
"skills": []
} "2": {
"age": 14,
"name": [
"maya",
"raji",
],
"skills": ["writing"]
}
}
The issue is that when I add the new data it writes another top-level {, so the file ends up with two top-level values, which is not allowed by the JSON standard:
{"1": {
"age": 10,
"name": [
"ramsi",
"jack",
"adem",
"sara",
],
"skills": []
}} {"2": {
"age": 14,
"name": [
"maya",
"raji",
],
"skills": ["writing"]
}
}
and this is my code to add the new (package_id):
list1[package_id] = {"age": x, "name": y, "skills": z}
ss = json.dumps(list1, indent=2)
data = []
with open('file.json', 'r+') as f:
    data = json.loads(f.read())
    data1 = json.dumps(data, indent=2)
    f.seek(0)
    f.write(data1)
    f.write(ss)
    f.truncate()
I write to the file twice because if I didn't store the existing contents and write them again, it would remove the old data and keep only package_id number 2.
It doesn't work that way. You can't add to a JSON record by appending another JSON record. A JSON file always has exactly one top-level value. You need to modify that value.
with open('file.json', 'r') as f:
    data = json.loads(f.read())
data[package_id] = {'age': x, 'name': y, 'skills': z}
with open('file.json', 'w') as f:
    f.write(json.dumps(data, indent=2))
I am new to pyspark. I have a requirement where I need to convert a big CSV file at an HDFS location into multiple nested JSON files based on distinct PrimaryId.
Sample Input: data.csv
PrimaryId,FirstName,LastName,City,CarName,DogName
100,John,Smith,NewYork,Toyota,Spike
100,John,Smith,NewYork,BMW,Spike
100,John,Smith,NewYork,Toyota,Rusty
100,John,Smith,NewYork,BMW,Rusty
101,Ben,Swan,Sydney,Volkswagen,Buddy
101,Ben,Swan,Sydney,Ford,Buddy
101,Ben,Swan,Sydney,Audi,Buddy
101,Ben,Swan,Sydney,Volkswagen,Max
101,Ben,Swan,Sydney,Ford,Max
101,Ben,Swan,Sydney,Audi,Max
102,Julia,Brown,London,Mini,Lucy
Sample Output Files:
File1: Output_100.json
{
  "100": [
    {
      "City": "NewYork",
      "FirstName": "John",
      "LastName": "Smith",
      "CarName": [
        "Toyota",
        "BMW"
      ],
      "DogName": [
        "Spike",
        "Rusty"
      ]
    }
  ]
}
File2: Output_101.json
{
  "101": [
    {
      "City": "Sydney",
      "FirstName": "Ben",
      "LastName": "Swan",
      "CarName": [
        "Volkswagen",
        "Ford",
        "Audi"
      ],
      "DogName": [
        "Buddy",
        "Max"
      ]
    }
  ]
}
File3: Output_102.json
{
  "102": [
    {
      "City": "London",
      "FirstName": "Julia",
      "LastName": "Brown",
      "CarName": [
        "Mini"
      ],
      "DogName": [
        "Lucy"
      ]
    }
  ]
}
Any quick help will be appreciated.
It seems you need to perform a group by on Id and collect Cars and Dogs as a set.
from pyspark.sql.functions import collect_set

df = spark.read.format("csv").option("header", "true").load("cars.csv")
df2 = (
    df
    .groupBy("PrimaryId", "FirstName", "LastName")
    .agg(collect_set('CarName').alias('CarName'), collect_set('DogName').alias('DogName'))
)
df2.write.format("json").save("cars.json", mode="overwrite")
Generated files:
{"PrimaryId":"100","FirstName":"John","LastName":"Smith","CarName":["Toyota","BMW"],"DogName":["Spike","Rusty"]}
{"PrimaryId":"101","FirstName":"Ben","LastName":"Swan","CarName":["Ford","Volkswagen","Audi"],"DogName":["Max","Buddy"]}
{"PrimaryId":"102","FirstName":"Julia","LastName":"Brown","CarName":["Mini"],"DogName":["Lucy"]}
Let me know if this is what you are looking for.
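If you really need separate Output_<id>.json files keyed by the id, as the question shows, one possible follow-up is to collect the small grouped result to the driver and write each row with plain json. This is just a sketch, assuming City is added to the groupBy columns above and that the grouped data fits in driver memory:

import json

# Assumes df2 was built as above but with "City" included in the groupBy;
# collect() is acceptable here because there is one row per PrimaryId.
for row in df2.collect():
    payload = {
        row["PrimaryId"]: [{
            "City": row["City"],
            "FirstName": row["FirstName"],
            "LastName": row["LastName"],
            "CarName": list(row["CarName"]),
            "DogName": list(row["DogName"]),
        }]
    }
    with open("Output_{}.json".format(row["PrimaryId"]), "w") as fh:
        json.dump(payload, fh, indent=2)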
You can use pandas.groupby() to group by the Id and then iterate over the DataFrameGroupBy object, creating the objects and writing the files.
You need to install pandas into your virtualenv with $ pip install pandas.
# coding: utf-8
import json
import pandas as pd

def group_csv_columns(csv_file):
    df = pd.read_csv(csv_file)
    group_frame = df.groupby(['PrimaryId'])
    for i in group_frame:
        data_frame = i[1]
        data = {}
        data[i[0]] = [{
            "City": data_frame['City'].unique().tolist()[0],
            "FirstName": data_frame['FirstName'].unique().tolist()[0],
            "CarName": data_frame['CarName'].unique().tolist(),
            'DogName': data_frame['DogName'].unique().tolist(),
            'LastName': data_frame['LastName'].unique().tolist()[0],
        }]
        # Write to file
        file_name = 'Output_' + str(i[0]) + '.json'
        with open(file_name, 'w') as fh:
            contents = json.dumps(data)
            fh.write(contents)

group_csv_columns('/tmp/sample.csv')
Call group_csv_columns() with the name of the CSV file.
Check the pandas docs.
I have a JSON file saved on a local server, such as:
{
  "user": "user1",
  "id": "21779"
}
and I want to write another dict into this JSON file, so I need the new content to look like this:
{
  {
    "user": "user1",
    "id": "21779"
  },
  {
    "user": "user2",
    "id": "21780"
  }
}
Or:
[
  {
    "user": "user1",
    "id": "21779"
  },
  {
    "user": "user2",
    "id": "21780"
  }
]
I tried to use json.dump() to add the new element, but it is displayed as:
{
  "user": "user1",
  "id": "21779"
}
{
  "user": "user2",
  "id": "21780"
}
It is not a valid JSON file.
How can I do this using json.dump, json.load, or other methods?
Thanks for your help!
You have to read your JSON file and then convert it to a list instead of a dict. Then you just need to append to that list and overwrite your JSON file.
import json

data = json.load(open('data.json'))

# convert data to a list if it is not one already
if type(data) is dict:
    data = [data]

# append the new item to the data list
data.append({
    "user": "user2",
    "id": "21780"
})

# write the list back to the file
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
You can do this with a list, not with a dict. Try the solution below and see if it helps.
import json

def appendList():
    with open("test.json", mode='r', encoding='utf-8') as f:
        feeds = json.load(f)
    print(feeds)
    with open("test.json", mode='w', encoding='utf-8') as feedsjson:
        entry = {"user": "user3", "id": "21574"}
        feeds.append(entry)
        print(json.dump(feeds, feedsjson))
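A small usage note, not part of the original answer: appendList() expects test.json to already hold a JSON array, so a hypothetical setup and call could look like this:

import json

# Hypothetical setup: seed test.json with a JSON array so feeds.append() works.
with open("test.json", mode='w', encoding='utf-8') as f:
    json.dump([{"user": "user1", "id": "21779"}], f)

appendList()  # appends {"user": "user3", "id": "21574"} and rewrites test.json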
I am trying to convert JSON data into a CSV in Python and found this code listed on Stack Exchange from a while back (link: How can I convert JSON to CSV?).
It no longer works in Python 3, giving me different errors. Does anyone know how to fix it for Python 3? Thanks.
import csv
import json
x = """[
{
"pk": 22,
"model": "auth.permission",
"fields": {
"codename": "add_logentry",
"name": "Can add log entry",
"content_type": 8
}
},
{
"pk": 23,
"model": "auth.permission",
"fields": {
"codename": "change_logentry",
"name": "Can change log entry",
"content_type": 8
}
},
{
"pk": 24,
"model": "auth.permission",
"fields": {
"codename": "delete_logentry",
"name": "Can delete log entry",
"content_type": 8
}
}
]"""
x = json.loads(x)
f = csv.writer(open("test.csv", "wb+"))
# Write the CSV header; if you don't need it, remove this line
f.writerow(["pk", "model", "codename", "name", "content_type"])
for x in x:
    f.writerow([x["pk"],
                x["model"],
                x["fields"]["codename"],
                x["fields"]["name"],
                x["fields"]["content_type"]])
You were opening the file in binary mode with wb+ whereas you are trying to write str.
f = csv.writer(open("test.csv", "w+"))
Maybe this code will help you understand that in some way.
import json, csv

data = []
with open('your_json_file_here.json') as file:
    for line in file:
        data.append(json.loads(line))
length = len(data)

with open('create_new_file.csv', 'w') as f:
    writer = csv.writer(f)
    writers = csv.DictWriter(f, fieldnames=['header1', 'header2'])
    writers.writeheader()
    for iter in range(length):
        writer.writerow((data[iter]['specific_col_name1'], data[iter]['specific_col_name2']))
f.close()
Here is the sample JSON which I am trying to parse in Python.
I am having a hard time parsing through "files":
Any help is appreciated.
{
  "startDate": "2016-02-19T08:19:30.764-0700",
  "endDate": "2016-02-19T08:20:19.058-07:00",
  "files": [
    {
      "createdOn": "2017-02-19T08:20:19.391-0700",
      "contentType": "text/plain",
      "type": "jpeg",
      "id": "Photoshop",
      "fileUri": "output.log"
    }
  ],
  "status": "OK",
  "linkToken": "-3418849688029673541",
  "subscriberViewers": [
    "null"
  ]
}
To print the id of each file in the array:
import json

# rawdata is the JSON document above as a string
data = json.loads(rawdata)

files = data['files']
for f in files:
    print(f['id'])
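If the document lives in a file rather than a string, json.load works the same way. A small sketch, assuming the sample above is saved as sample.json (the filename is just an example):

import json

# Assumes the sample document above was saved to sample.json
with open("sample.json") as fh:
    data = json.load(fh)

for f in data["files"]:
    print(f["id"], f["fileUri"])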