I am trying to convert the following CSV to the JSON shown below. Any help will be appreciated.
Sample of CSV file (File would contain lot of network groups with network,host attributes)
Type,Value ,Name
Network,10.0.0.0/8,network_group_3
Host,10.0.0.27,network_group_3
Host,10.0.0.28,network_group_3
Network,10.10.10.0/24,network_group_4
Network,10.10.20.0/24,network_group_4
Host,10.10.10.6,network_group_4
Output in JSON Needed
netgroup =
"literals": [
{
"type": "Network",
"value": "10.0.0.0/8"
},
{
"type": "Host",
"value": "10.0.0.27"
},
{
"type": "Host",
"value": "10.0.0.28"
}
],
"name": "network_group_3"
},
{
"literals": [
{
"type": "Network",
"value": "10.10.10.0/24"
},
{
"type": "Network",
"value": "10.10.20.0/24"
},
{
"type": "Host",
"value": "10.10.10.6"
}
],
"name": "network_group_4"
Here is a good explanation of Python for converting CSV to JSON:
http://www.idiotinside.com/2015/09/18/csv-json-pretty-print-python/
Here is a solution using jq
If the file filter.jq contains
# Convert raw CSV text (jq invoked with -R -s reads the file as one string)
# into an array of objects grouped by the Name column.
[
split("\n") # split string into lines
| (.[0] | split(",")) as $headers # split header
| (.[1:][] | split(",")) # split data rows
| select(length>0) # get rid of empty lines
]
| [
group_by(.[2])[] # one group per distinct Name (third column)
| {
name: .[0][2], # Name taken from the group's first row
literals: map({type:.[0], value:.[1]}) # Type/Value of every row in the group
}
]
and your data is in a file called data then
jq -M -R -s -r -f filter.jq data
will generate
[
{
"name": "network_group_3",
"literals": [
{
"type": "Network",
"value": "10.0.0.0/8"
},
{
"type": "Host",
"value": "10.0.0.27"
},
{
"type": "Host",
"value": "10.0.0.28"
}
]
},
{
"name": "network_group_4",
"literals": [
{
"type": "Network",
"value": "10.10.10.0/24"
},
{
"type": "Network",
"value": "10.10.20.0/24"
},
{
"type": "Host",
"value": "10.10.10.6"
}
]
}
]
Late is better than never, so here is a solution using the convtools library:
from convtools import conversion as c
from convtools.contrib.tables import Table
# store converter somewhere if it needs to be reused
# Groups the input rows (dicts keyed by the CSV header) by their "Name"
# value and collects each group's Type/Value pairs into a "literals" list;
# gen_converter() compiles the pipeline into a plain Python function.
converter = (
c.group_by(c.item("Name"))
.aggregate(
{
# One {"type", "value"} dict appended per row of the group.
"literals": c.ReduceFuncs.Array(
{
"type": c.item("Type"),
"value": c.item("Value"),
}
),
"name": c.item("Name"),
}
)
.gen_converter()
)
# iterable of rows and it can only be consumed once
rows = Table.from_csv("tmp2.csv", header=True).into_iter_rows(dict)
assert converter(rows) == [
{'literals': [{'type': 'Network', 'value': '10.0.0.0/8'},
{'type': 'Host', 'value': '10.0.0.27'},
{'type': 'Host', 'value': '10.0.0.28'}],
'name': 'network_group_3'},
{'literals': [{'type': 'Network', 'value': '10.10.10.0/24'},
{'type': 'Network', 'value': '10.10.20.0/24'},
{'type': 'Host', 'value': '10.10.10.6'}],
'name': 'network_group_4'}]
Related
I would like to store two-dimensional arrays of numbers in Avro.
I have tried the following:
{
"namespace": "com.company",
"type": "record",
"name": "MyName",
"doc" : "...",
"fields": [
{
"name": "MyArray",
"type": {
"type": "array",
"items": {
"type": {"type": "array","items": "int"}
}
}
}
]
}
But when I tried to read it with the parser:
import avro.schema
schema = avro.schema.parse(open("my_schema.avsc", "r").read())
I get the following error:
avro.errors.SchemaParseException: Type property "{'type': 'array', 'items': {'type': {'type': 'array', 'items': 'int'}}}"
not a valid Avro schema: Items schema ({'type': {'type': 'array', 'items': 'int'}}) not
a valid Avro schema: Undefined type: {'type': 'array', 'items': 'int'}
(known names: dict_keys(['com.algoint.ecg_frame_file.EcgFrameFile']))
It looks like you have one too many type keys.
Your schema should be this instead:
{
"namespace": "com.company",
"type": "record",
"name": "MyName",
"doc" : "...",
"fields": [
{
"name": "MyArray",
"type": {
"type": "array",
"items": {"type": "array","items": "int"}
}
}
]
}
I need to flatten a JSON with different levels of nested JSON arrays in Python
Part of my JSON looks like:
{
"data": {
"workbooks": [
{
"projectName": "TestProject",
"name": "wkb1",
"site": {
"name": "site1"
},
"description": "",
"createdAt": "2020-12-13T15:38:58Z",
"updatedAt": "2020-12-13T15:38:59Z",
"owner": {
"name": "user1",
"username": "John"
},
"embeddedDatasources": [
{
"name": "DS1",
"hasExtracts": false,
"upstreamDatasources": [
{
"projectName": "Data Sources",
"name": "DS1",
"hasExtracts": false,
"owner": {
"username": "user2"
}
}
],
"upstreamTables": [
{
"name": "table_1",
"schema": "schema_1",
"database": {
"name": "testdb",
"connectionType": "redshift"
}
},
{
"name": "table_2",
"schema": "schema_2",
"database": {
"name": "testdb",
"connectionType": "redshift"
}
},
{
"name": "table_3",
"schema": "schema_3",
"database": {
"name": "testdb",
"connectionType": "redshift"
}
}
]
},
{
"name": "DS2",
"hasExtracts": false,
"upstreamDatasources": [
{
"projectName": "Data Sources",
"name": "DS2",
"hasExtracts": false,
"owner": {
"username": "user3"
}
}
],
"upstreamTables": [
{
"name": "table_4",
"schema": "schema_1",
"database": {
"name": "testdb",
"connectionType": "redshift"
}
}
]
}
]
}
]
}
}
The output should like this
sample output
Tried using json_normalize but couldn't make it work. Currently parsing it by reading the nested arrays using loops and reading values using keys. Looking for a better way of normalizing the JSON
Here's a partial solution:
First save your data in the same directory as the script as a JSON file called data.json.
import json
import pandas as pd

# NOTE: json_normalize moved to the top-level pandas namespace in pandas 1.0;
# importing it from pandas.io.json is deprecated and removed in newer versions,
# so use pd.json_normalize instead.
with open('data.json') as json_file:
    json_data = json.load(json_file)

new_data = json_data['data']['workbooks']

# Expand each workbook's embeddedDatasources -> upstreamTables into rows,
# carrying the workbook-level fields along as metadata columns; record_prefix
# keeps the record columns from colliding with the meta columns.
result = pd.json_normalize(
    new_data,
    ['embeddedDatasources', 'upstreamTables'],
    ['projectName', 'name', 'createdAt', 'updatedAt', 'owner', 'site'],
    record_prefix='_',
)
result
Output:
_name
_schema
_database.name
_database.connectionType
projectName
name
createdAt
updatedAt
owner
site
0
table_1
schema_1
testdb
redshift
TestProject
wkb1
2020-12-13T15:38:58Z
2020-12-13T15:38:59Z
{'name': 'user1', 'username': 'John'}
{'name': 'site1'}
1
table_2
schema_2
testdb
redshift
TestProject
wkb1
2020-12-13T15:38:58Z
2020-12-13T15:38:59Z
{'name': 'user1', 'username': 'John'}
{'name': 'site1'}
2
table_3
schema_3
testdb
redshift
TestProject
wkb1
2020-12-13T15:38:58Z
2020-12-13T15:38:59Z
{'name': 'user1', 'username': 'John'}
{'name': 'site1'}
3
table_4
schema_1
testdb
redshift
TestProject
wkb1
2020-12-13T15:38:58Z
2020-12-13T15:38:59Z
{'name': 'user1', 'username': 'John'}
{'name': 'site1'}
What next?
I think if you re-structure the data a bit in advance (for example flattening 'database': {'name': 'testdb', 'connectionType': 'redshift'}) you will be able to add more fields to the meta parameter.
As you see in the documentation of json_normalize, the four parameters that are used here are:
data: dict or list of dicts :
Unserialized JSON objects.
record_path: str or list of str : default None
Path in each object to list of records. If not passed, data will be assumed to be an array of records.
meta: list of paths (str or list of str) : default None
Fields to use as metadata for each record in resulting table.
record_prefix: str : default None
If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is [‘foo’, ‘bar’].
tl;dr: Your final output along with detailed steps are mentioned in here
details :
To answer this question you need to have a thorough understanding of pandas.json_normalize. The understanding of json_normalize, record_path, meta, explode and in general json parsing.
import json
import pandas as pd
# Sample API payload as a Python literal (note Python booleans: False,
# not JSON's false) — two datasources under one workbook, with nested
# upstreamDatasources and upstreamTables lists.
data = {
"data":
{
"workbooks":
[
{
"projectName": "TestProject",
"name": "wkb1",
"site":
{
"name": "site1"
},
"description": "",
"createdAt": "2020-12-13T15:38:58Z",
"updatedAt": "2020-12-13T15:38:59Z",
"owner":
{
"name": "user1",
"username": "John"
},
"embeddedDatasources":
[
{
"name": "DS1",
"hasExtracts": False,
"upstreamDatasources":
[
{
"projectName": "Data Sources",
"name": "DS1",
"hasExtracts": False,
"owner":
{
"username": "user2"
}
}
],
"upstreamTables":
[
{
"name": "table_1",
"schema": "schema_1",
"database":
{
"name": "testdb",
"connectionType": "redshift"
}
},
{
"name": "table_2",
"schema": "schema_2",
"database":
{
"name": "testdb",
"connectionType": "redshift"
}
},
{
"name": "table_3",
"schema": "schema_3",
"database":
{
"name": "testdb",
"connectionType": "redshift"
}
}
]
},
{
"name": "DS2",
"hasExtracts": False,
"upstreamDatasources":
[
{
"projectName": "Data Sources",
"name": "DS2",
"hasExtracts": False,
"owner":
{
"username": "user3"
}
}
],
"upstreamTables":
[
{
"name": "table_4",
"schema": "schema_1",
"database":
{
"name": "testdb",
"connectionType": "redshift"
}
}
]
}
]
}
]
}
}
First you need to bring it to the dict level.
# Drill down to the list of workbook records — the actual rows to flatten.
data_list = data['data']['workbooks']
I did some data massaging by renaming some columns as per requirements.
# Load the workbook records into a DataFrame and rename the workbook-level
# columns so they won't collide with the nested datasource/table fields
# during json_normalize. One dict-based rename replaces the four chained
# single-column renames — the result is identical since the keys are disjoint.
data_list_pd = pd.DataFrame(data_list)
data_list_pd = data_list_pd.rename(
    columns={
        'name': 'wkb',
        'createdAt': 'wkb_createdDt',
        'updatedAt': 'wkb_updatedDt',
        'projectName': 'prj',
    },
    errors='ignore',
)
data_list_pd
# json_normalize consumes dicts, so round-trip the DataFrame back to records.
data_list = json.loads(data_list_pd.to_json(orient="records"))
data_list
Next is where the core of your problem statement lies. You need to flatten the JSON by mentioning the record_path, which is essentially the nested dictionary you want to expand, along with the meta, which is the metadata/remaining columns you want to display. After that you need to explode the columns which contain lists. You can achieve this by chaining the explode method a couple of times.
# Flatten one level: expand each workbook's embeddedDatasources into rows,
# carrying the renamed workbook-level columns plus the nested site/owner
# names as metadata, then explode the list-valued columns into one row each.
flattened_dataframe= pd.json_normalize(data_list,
record_path = 'embeddedDatasources',
meta = ['prj','wkb','wkb_createdDt', 'wkb_updatedDt',['site','name'],['owner','name'],['owner','username']],
errors='ignore').explode('upstreamDatasources').explode('upstreamTables')
flattened_dataframe
You can repeat this process a couple of times to reach your final goal/desired result. Since json_normalize works on JSON/dict data, you will have to convert the dataframe back into JSON after each iteration. You can follow these steps.
# Convert back to a list of dicts so json_normalize can be applied again.
flattened_json = json.loads(flattened_dataframe.to_json(orient="records"))
Also read about to_json.
I want to merge many JSON files with the same nested structure, using jsonmerge, but have been unsuccessful so far. For example, I want to merge base and head:
# Base document: jsonmerge merges `head` (below) into this one.
base = {
"data": [
{
"author_id": "id1",
"id": "1"
},
{
"author_id": "id2",
"id": "2"
}
],
"includes": {
"users": [
{
"id": "user1",
"name": "user1"
},
{
"id": "user2",
"name": "user2"
}
]
}
}
# Head document: its "data" and "includes.users" entries should be
# appended to base's, not replace them.
head = {
"data": [
{
"author_id": "id3",
"id": "3"
},
{
"author_id": "id4",
"id": "4"
}
],
"includes": {
"users": [
{
"id": "user3",
"name": "user3"
},
{
"id": "user4",
"name": "user4"
}
]
}
}
The resulting JSON should be:
# Desired merge result: both the "data" and the "includes.users" arrays
# are concatenated (base entries first, then head entries).
final_result = {
"data": [
{
"author_id": "id1",
"id": "1"
},
{
"author_id": "id2",
"id": "2"
},
{
"author_id": "id3",
"id": "3"
},
{
"author_id": "id4",
"id": "4"
}
],
"includes": {
"users": [
{
"id": "user1",
"name": "user1"
},
{
"id": "user2",
"name": "user2"
},
{
"id": "user3",
"name": "user3"
},
{
"id": "user4",
"name": "user4"
}
]
}
}
However, I've only managed to merge correctly the data fields, while for users it doesn't seem to work. This is my code:
from jsonmerge import merge
from jsonmerge import Merger
# NOTE(review): this schema is the source of the reported problem. jsonmerge
# schemas follow jsonschema structure, so the nested "users" entry is not
# interpreted as a sub-schema here — "includes" lacks "type": "object" and a
# "properties" wrapper — and head's "includes" overwrites base's wholesale,
# which matches the observed output below.
schema = { "properties": {
"data": {
"mergeStrategy": "append"
},
"includes": {
"users": {
"mergeStrategy": "append"
}
}
}
}
merger = Merger(schema)
# Merge head into base; only "data" actually gets the append strategy here.
result = merger.merge(base, head)
The end result is:
{'data': [{'author_id': 'id1', 'id': '1'},
{'author_id': 'id2', 'id': '2'},
{'author_id': 'id3', 'id': '3'},
{'author_id': 'id4', 'id': '4'}],
'includes': {'users': [{'id': 'user3', 'name': 'user3'},
{'id': 'user4', 'name': 'user4'}]}}
The issue is with the definition of the schema, but I do not know if it is possible to do it like that with jsonmerge. Any help is appreciated!
Thank you!
It is based on jsonschema. So when you have an object within an object (e.g. "users" within "includes") then you'll need to tell jsonschema it is dealing with another object like so:
schema = {
"properties": {
"data": {
"mergeStrategy": "append"  # concatenate the two "data" arrays
},
"includes": {
# Declare the nested object explicitly so jsonmerge descends into it
# instead of replacing "includes" as an opaque value.
"type": "object",
"properties": {
"users": {
"mergeStrategy": "append"  # concatenate the two "users" arrays
}
}
}
}
}
Note that this also happens for your top-level objects, hence you have "properties" argument on the highest level.
{
"type": "Data",
"version": "1.0",
"box": {
"identifier": "abcdef",
"serial": "12345678"
},
"payload": {
"Type": "EL",
"Version": "1",
"Result": "Successful",
"Reference": null,
"Box": {
"Identifier": "abcdef",
"Serial": "12345678"
},
"Configuration": {
"EL": "1"
},
"vent": [
{
"ventType": "Arm",
"Timestamp": "2020-03-18T12:17:04+10:00",
"Parameters": [
{
"Name": "Arm",
"Value": "LT"
},
{
"Name": "Status",
"Value": "LD"
}
]
},
{
"ventType": "Arm",
"Timestamp": "2020-03-18T12:17:24+10:00",
"Parameters": [
{
"Name": "Arm",
"Value": "LT"
},
{
"Name": "Status",
"Value": "LD"
}
]
},
{
"EventType": "TimeUpdateCompleted",
"Timestamp": "2020-03-18T02:23:21.2979668Z",
"Parameters": [
{
"Name": "ActualAdjustment",
"Value": "PT0S"
},
{
"Name": "CorrectionOffset",
"Value": "PT0S"
},
{
"Name": "Latency",
"Value": "PT0.2423996S"
}
]
}
]
}
}
If you're looking to transfer information from a JSON file to a CSV, then you can use the following code to read in a JSON file into a dictionary in Python:
import json

# Parse the JSON file into a Python dictionary.
with open('data.txt') as fh:
    data_dict = json.load(fh)
You could then convert this dictionary into a list with either data_dict.items() or data_dict.values().
Then you just need to write this list to a CSV file which you can easily do by just looping through the list.
I have a project in which i have to convert a json file into a CSV file.
The Json sample :
{
"P_Portfolio Group": {
"depth": 1,
"dataType": "PortfolioOverview",
"levelId": "P_Portfolio Group",
"path": [
{
"label": "Portfolio Group",
"levelId": "P_Portfolio Group"
}
],
"label": "Portfolio Group",
"header": [
{
"id": "Label",
"label": "Security name",
"type": "text",
"contentType": "text"
},
{
"id": "SecurityValue",
"label": "MioCHF",
"type": "text",
"contentType": "number"
},
{
"id": "SecurityValuePct",
"label": "%",
"type": "text",
"contentType": "pct"
}
],
"data": [
{
"dataValues": [
{
"value": "Client1",
"type": "text"
},
{
"value": 2068.73,
"type": "number"
},
{
"value": 14.0584,
"type": "pct"
}
]
},
{
"dataValues": [
{
"value": "Client2",
"type": "text"
},
{
"value": 1511.9,
"type": "number"
},
{
"value": 10.2744,
"type": "pct"
}
]
},
{
"dataValues": [
{
"value": "Client3",
"type": "text"
},
{
"value": 1354.74,
"type": "number"
},
{
"value": 9.2064,
"type": "pct"
}
]
},
{
"dataValues": [
{
"value": "Client4",
"type": "text"
},
{
"value": 1225.78,
"type": "number"
},
{
"value": 8.33,
"type": "pct"
}
]
}
],
"summary": [
{
"value": "Total",
"type": "text"
},
{
"value": 11954.07,
"type": "number"
},
{
"value": 81.236,
"type": "pct"
}
]
}
}
And i want o obtain something like:
Client1,2068.73,14.0584
Client2,1511.9,10.2744
Client3,871.15,5.92
Client4,11954.07,81.236
Can you please give me a hint.
import csv
import json

# NOTE(review): this snippet fails for several reasons:
#  - "C:\Users\..." is a SyntaxError on Python 3 (\U begins a unicode
#    escape); use a raw string r"C:\Users\..." or forward slashes.
#  - "wb+" is a Python 2 file mode; Python 3's csv.writer needs a text-mode
#    file opened with newline="".
#  - `for x in x` iterates the top-level dict's keys (strings), so the
#    subscripting below cannot work; also ["data"] holds a list and
#    ["dataValues"] a list of dicts, neither indexable by a string key.
with open("C:\Users\SVC\Desktop\test.json") as file:
    x = json.load(file)
f = csv.writer(open("C:\Users\SVC\Desktop\test.csv", "wb+"))
for x in x:
    f.writerow(x["P_Portfolio Group"]["data"]["dataValues"]["value"])
but it doesn't work.
Can you please give me a hint.
import csv
import json

# Raw strings keep the "\U" in the Windows paths from being parsed as an
# (invalid) unicode escape under Python 3.
with open(r'C:\Users\SVC\Desktop\test.json') as json_file:
    portfolio_group = json.load(json_file)

# newline='' is required by the csv module on Python 3 so the writer does
# not emit blank rows on Windows.
with open(r'C:\Users\SVC\Desktop\test.csv', 'w', newline='') as csv_file:
    csv_obj = csv.writer(csv_file)
    # Each entry under "data" holds a "dataValues" list of {value, type}
    # dicts; write one CSV row containing the values of each entry.
    for data in portfolio_group['P_Portfolio Group']['data']:
        csv_obj.writerow([d['value'] for d in data['dataValues']])
This results in the following C:\Users\SVC\Desktop\test.csv content:
Client1,2068.73,14.0584
Client2,1511.9,10.2744
Client3,1354.74,9.2064
Client4,1225.78,8.33
Use the pandas library:
import pandas as pd

# The source file is JSON, so it must be read with read_json, not read_csv;
# the raw string avoids the invalid \U escape in the Windows path on Python 3.
data = pd.read_json(r"C:\Users\SVC\Desktop\test.json")
data.to_csv('test.csv')
done