does `transform_lookup` save space? - python

I am trying to link several Altair charts that share aspects of the same data. I can do this by merging all the data into one data frame, but because of the nature of the data the merged data frame is much larger than is needed to have two separate data frames for each of the two charts. This is because the columns unique to each chart have many repeated rows for each entry in the shared column.
Would using transform_lookup save space over just using the merged data frame, or does transform_lookup end up doing the whole merge internally?

No, the entire dataset is still included in the vegaspec when you use transform_lookup. You can see this by printing the json spec of the charts you create. With the example from the docs:
import altair as alt
import pandas as pd
from vega_datasets import data
people = data.lookup_people().head(3)
people
name age height
0 Alan 25 180
1 George 32 174
2 Fred 39 182
groups = data.lookup_groups().head(3)
groups
group person
0 1 Alan
1 1 George
2 1 Fred
With pandas merge:
merged = pd.merge(groups, people, how='left',
left_on='person', right_on='name')
print(alt.Chart(merged).mark_bar().encode(
x='mean(age):Q',
y='group:O'
).to_json())
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
"config": {
"view": {
"continuousHeight": 300,
"continuousWidth": 400
}
},
"data": {
"name": "data-b41b97ffc89b39c92e168871d447e720"
},
"datasets": {
"data-b41b97ffc89b39c92e168871d447e720": [
{
"age": 25,
"group": 1,
"height": 180,
"name": "Alan",
"person": "Alan"
},
{
"age": 32,
"group": 1,
"height": 174,
"name": "George",
"person": "George"
},
{
"age": 39,
"group": 1,
"height": 182,
"name": "Fred",
"person": "Fred"
}
]
},
"encoding": {
"x": {
"aggregate": "mean",
"field": "age",
"type": "quantitative"
},
"y": {
"field": "group",
"type": "ordinal"
}
},
"mark": "bar"
}
With transform lookup all the data is there but as to separate dataset (so technically it takes a little bit of more space with the additional braces and the transform):
print(alt.Chart(groups).mark_bar().encode(
x='mean(age):Q',
y='group:O'
).transform_lookup(
lookup='person',
from_=alt.LookupData(data=people, key='name',
fields=['age'])
).to_json())
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
"config": {
"view": {
"continuousHeight": 300,
"continuousWidth": 400
}
},
"data": {
"name": "data-5fe242a79352d1fe243b588af570c9c6"
},
"datasets": {
"data-2b374d1509415e1d327c3a7521f8117c": [
{
"age": 25,
"height": 180,
"name": "Alan"
},
{
"age": 32,
"height": 174,
"name": "George"
},
{
"age": 39,
"height": 182,
"name": "Fred"
}
],
"data-5fe242a79352d1fe243b588af570c9c6": [
{
"group": 1,
"person": "Alan"
},
{
"group": 1,
"person": "George"
},
{
"group": 1,
"person": "Fred"
}
]
},
"encoding": {
"x": {
"aggregate": "mean",
"field": "age",
"type": "quantitative"
},
"y": {
"field": "group",
"type": "ordinal"
}
},
"mark": "bar",
"transform": [
{
"from": {
"data": {
"name": "data-2b374d1509415e1d327c3a7521f8117c"
},
"fields": [
"age",
"height"
],
"key": "name"
},
"lookup": "person"
}
]
}
When transform_lookup can save space is if you use it with the URLs of two dataset:
people = data.lookup_people.url
groups = data.lookup_groups.url
print(alt.Chart(groups).mark_bar().encode(
x='mean(age):Q',
y='group:O'
).transform_lookup(
lookup='person',
from_=alt.LookupData(data=people, key='name',
fields=['age'])
).to_json())
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
"config": {
"view": {
"continuousHeight": 300,
"continuousWidth": 400
}
},
"data": {
"url": "https://vega.github.io/vega-datasets/data/lookup_groups.csv"
},
"encoding": {
"x": {
"aggregate": "mean",
"field": "age",
"type": "quantitative"
},
"y": {
"field": "group",
"type": "ordinal"
}
},
"mark": "bar",
"transform": [
{
"from": {
"data": {
"url": "https://vega.github.io/vega-datasets/data/lookup_people.csv"
},
"fields": [
"age",
"height"
],
"key": "name"
},
"lookup": "person"
}
]
}

Related

Modify the value of a field of a specific nested object (its index) depending on a condition

I would like to modify the value of a field on a specific index of a nested type depending on another value of the same nested object or a field outside of the nested object.
As example, I have the current mapping of my index feed:
{
"feed": {
"mappings": {
"properties": {
"attacks_ids": {
"type": "keyword"
},
"created_by": {
"type": "keyword"
},
"date": {
"type": "date"
},
"groups_related": {
"type": "keyword"
},
"indicators": {
"type": "nested",
"properties": {
"date": {
"type": "date"
},
"description": {
"type": "text"
},
"role": {
"type": "keyword"
},
"type": {
"type": "keyword"
},
"value": {
"type": "keyword"
}
}
},
"malware_families": {
"type": "keyword"
},
"published": {
"type": "boolean"
},
"references": {
"type": "keyword"
},
"tags": {
"type": "keyword"
},
"targeted_countries": {
"type": "keyword"
},
"title": {
"type": "text"
},
"tlp": {
"type": "keyword"
}
}
}
}
}
Take the following document as example:
{
"took": 194,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "feed",
"_type": "_doc",
"_id": "W3CS7IABovFpcGfZjfyu",
"_score": 1,
"_source": {
"title": "Test",
"date": "2022-05-22T16:21:09.159711",
"created_by": "finch",
"tlp": "white",
"published": true,
"references": [
"test",
"test"
],
"tags": [
"tag1",
"tag2"
],
"targeted_countries": [
"Italy",
"Germany"
],
"malware_families": [
"family1",
"family2"
],
"groups_related": [
"group1",
"griup2"
],
"attacks_ids": [
""
],
"indicators": [
{
"value": "testest",
"description": "This is a test",
"type": "sha256",
"role": "file",
"date": "2022-05-22T16:21:09.159560"
},
{
"value": "testest2",
"description": "This is a test 2",
"type": "ipv4",
"role": "c2",
"date": "2022-05-22T16:21:09.159699"
}
]
}
}
]
}
}
I would like to make this update: indicators[0].value = 'changed'
if _id == 'W3CS7IABovFpcGfZjfyu'
or if title == 'some_title'
or if indicators[0].role == 'c2'
I already tried with a script, but it seems I can't manage to get it work, I hope the explanation is clear, ask any question if not, thank you.
Edit 1:
I managed to make it work, however it needs the _id, still looking for a way to do that without it.
My partial solution:
update = Pulse.get(id="XHCz7IABovFpcGfZWfz9") #Pulse is my document
update.update(script="for (indicator in ctx._source.indicators) {if (indicator.value=='changed2') {indicator.value='changed3'}}")
# Modify depending on the value of a field inside the same nested object

How can I generate a json file in pandas without including the index?

I have an input json file that looks roughly like this
[
{
"identifier": "116S5RJ63",
"containers": [
{
"contains": "soap",
"height": {
"unit": "FT",
"value": 12.07123829231181
},
"length": {
"unit": "FT",
"value": 12.07123829231181
},
"quantity": 1,
"weight": {
"unit": "volumeUnits",
"value": 10000
},
"width": {
"unit": "FT",
"value": 12.07123829231181
}
}
],{...}]
I read it in using
input_json = pd.read_json(input_json_file)
I then process the input_json a bit: nothing dramatic, just changing the contents of some fields. Next I try to output the json again as
input_json.to_json(output_file, orient='records', date_format='iso')
but the output looks like this
[
{
"index": 28741,
"identifier": "115JKLJVZ",
"containers": [
{
"contains": "soap",
"height": {
"unit": "FT",
"value": 12.07123829231181
},
"length": {
"unit": "FT",
"value": 12.07123829231181
},
"quantity": 1,
"weight": {
"unit": "volumeUnits",
"value": 10000
},
"width": {
"unit": "FT",
"value": 12.07123829231181
}
}
],{...}]
Specifically it now includes the field 'index', which I thought the orient='records' was supposed to deal with. I'm not sure what to do next. Any suggestions?
Try input_json.reset_index(drop=True, inplace=True) before saving it to file; this should drop the old index from being added as a column. reset_index documentation

Best way to build denormilazed dataframe with pandas from spotify API

I just downloaded some json from spotify and took a look into the pd.normalize_json().
But if I normalise the data i still have dictionaries within my dataframe. Also setting the level doesnt help.
DATA I want to have in my dataframe:
{
"collaborative": false,
"description": "",
"external_urls": {
"spotify": "https://open.spotify.com/playlist/5"
},
"followers": {
"href": null,
"total": 0
},
"href": "https://api.spotify.com/v1/playlists/5?additional_types=track",
"id": "5",
"images": [
{
"height": 640,
"url": "https://i.scdn.co/image/a",
"width": 640
}
],
"name": "Another",
"owner": {
"display_name": "user",
"external_urls": {
"spotify": "https://open.spotify.com/user/user"
},
"href": "https://api.spotify.com/v1/users/user",
"id": "user",
"type": "user",
"uri": "spotify:user:user"
},
"primary_color": null,
"public": true,
"snapshot_id": "M2QxNTcyYTkMDc2",
"tracks": {
"href": "https://api.spotify.com/v1/playlists/100&additional_types=track",
"items": [
{
"added_at": "2020-12-13T18:34:09Z",
"added_by": {
"external_urls": {
"spotify": "https://open.spotify.com/user/user"
},
"href": "https://api.spotify.com/v1/users/user",
"id": "user",
"type": "user",
"uri": "spotify:user:user"
},
"is_local": false,
"primary_color": null,
"track": {
"album": {
"album_type": "album",
"artists": [
{
"external_urls": {
"spotify": "https://open.spotify.com/artist/1dfeR4Had"
},
"href": "https://api.spotify.com/v1/artists/1dfDbWqFHLkxsg1d",
"id": "1dfeR4HaWDbWqFHLkxsg1d",
"name": "Q",
"type": "artist",
"uri": "spotify:artist:1dfeRqFHLkxsg1d"
}
],
"available_markets": [
"CA",
"US"
],
"external_urls": {
"spotify": "https://open.spotify.com/album/6wPXmlLzZ5cCa"
},
"href": "https://api.spotify.com/v1/albums/6wPXUJ9LzZ5cCa",
"id": "6wPXUmYJ9zZ5cCa",
"images": [
{
"height": 640,
"url": "https://i.scdn.co/image/ab676620a47",
"width": 640
},
{
"height": 300,
"url": "https://i.scdn.co/image/ab67616d0620a47",
"width": 300
},
{
"height": 64,
"url": "https://i.scdn.co/image/ab603e6620a47",
"width": 64
}
],
"name": "The (Deluxe ",
"release_date": "1920-07-17",
"release_date_precision": "day",
"total_tracks": 15,
"type": "album",
"uri": "spotify:album:6m5cCa"
},
"artists": [
{
"external_urls": {
"spotify": "https://open.spotify.com/artist/1dg1d"
},
"href": "https://api.spotify.com/v1/artists/1dsg1d",
"id": "1dfeR4HaWDbWqFHLkxsg1d",
"name": "Q",
"type": "artist",
"uri": "spotify:artist:1dxsg1d"
}
],
"available_markets": [
"CA",
"US"
],
"disc_number": 1,
"duration_ms": 21453,
"episode": false,
"explicit": false,
"external_ids": {
"isrc": "GBU6015"
},
"external_urls": {
"spotify": "https://open.spotify.com/track/5716J"
},
"href": "https://api.spotify.com/v1/tracks/5716J",
"id": "5716J",
"is_local": false,
"name": "Another",
"popularity": 73,
"preview_url": null,
"track": true,
"track_number": 3,
"type": "track",
"uri": "spotify:track:516J"
},
"video_thumbnail": {
"url": null
}
}
],
"limit": 100,
"next": null,
"offset": 0,
"previous": null,
"total": 1
},
"type": "playlist",
"uri": "spotify:playlist:fek"
}
So what are best practices to read nested data like this into one dataframe in pandas?
I'm glad for any advice.
EDIT:
so basically I want all keys as columns in my dataframe. But with normalise it stops at "tracks.items" and if I normalise this again i have the recursive problem again.
It depends on the information you are looking for. Take a look at pandas.read_json() to see if that can work. Also you can select data as such
json_output = {"collaborative": 'false',"description": "", "external_urls": {"spotify": "https://open.spotify.com/playlist/5"}}
df['collaborative'] = json_output['collaborative'] #set value of your df to value of returned json values

How to Remove Outer Layer of JSON in Python

I am trying to remove the outer (parent) layer of a JSON file so that I can process it, however I have no idea how.
As you will see by the code below, the outer 2 most layers are 2 dictionaries, however, python says the 2nd dictionary ("item") is just a string when I call its type. Am I incorrect in how I interpret the structure?
sample_object6 = {
"items":
{
"item":
[
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0002",
"type": "donut",
"name": "Raised",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0003",
"type": "donut",
"name": "Old Fashioned",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0004",
"type": "bar",
"name": "Bar",
"ppu": 0.75,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
]
},
"topping":
[
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
],
"fillings":
{
"filling":
[
{ "id": "7001", "name": "None", "addcost": 0 },
{ "id": "7002", "name": "Custard", "addcost": 0.25 },
{ "id": "7003", "name": "Whipped Cream", "addcost": 0.25 }
]
}
},
{
"id": "0005",
"type": "twist",
"name": "Twist",
"ppu": 0.65,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
]
},
"topping":
[
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
]
},
{
"id": "0006",
"type": "filled",
"name": "Filled",
"ppu": 0.75,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
]
},
"topping":
[
{ "id": "5002", "type": "Glazed" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
],
"fillings":
{
"filling":
[
{ "id": "7002", "name": "Custard", "addcost": 0 },
{ "id": "7003", "name": "Whipped Cream", "addcost": 0 },
{ "id": "7004", "name": "Strawberry Jelly", "addcost": 0 },
{ "id": "7005", "name": "Rasberry Jelly", "addcost": 0 }
]
}
}
]
}
}
I thought that it might be possible to store the nested portion starting at the first list (right after 'item') in a variable and then work with this but if I can't get python to see that item is a dictionary inside the items dictionary, then I fear I am at a loss with how to proceed.
Does anyone know what I am doing wrong?
Thank you in advance!
As far as the processing goes, there has been none because I could not even get the string to read as a dictionary appropriately.
This is what I tried to test if it was a dictionary:
for i in sample_object6:
print(i + str(type(i)))
for n in i["item"]:
print(n + str(type(n)))
After submitting the same code that I thought I had already submitted, I noticed that python is interpreting the object correctly. I have some obvious fundamental gaps in how to work in python and I'm sorry I took it to the forum.
For the record (and for future python newbies out there like me), I used the following code which returned the proper class types:
#this returned a class type of dictionary
print(type(sample_object6["items"]))
#this returned a class type of list
print(type(sample_object6["items"]["item"]))
Thank you SungJin Steve Yoo & Pm2Ring for your help.

Json to CSV using python and blender 2.74

I have a project in which i have to convert a json file into a CSV file.
The Json sample :
{
"P_Portfolio Group": {
"depth": 1,
"dataType": "PortfolioOverview",
"levelId": "P_Portfolio Group",
"path": [
{
"label": "Portfolio Group",
"levelId": "P_Portfolio Group"
}
],
"label": "Portfolio Group",
"header": [
{
"id": "Label",
"label": "Security name",
"type": "text",
"contentType": "text"
},
{
"id": "SecurityValue",
"label": "MioCHF",
"type": "text",
"contentType": "number"
},
{
"id": "SecurityValuePct",
"label": "%",
"type": "text",
"contentType": "pct"
}
],
"data": [
{
"dataValues": [
{
"value": "Client1",
"type": "text"
},
{
"value": 2068.73,
"type": "number"
},
{
"value": 14.0584,
"type": "pct"
}
]
},
{
"dataValues": [
{
"value": "Client2",
"type": "text"
},
{
"value": 1511.9,
"type": "number"
},
{
"value": 10.2744,
"type": "pct"
}
]
},
{
"dataValues": [
{
"value": "Client3",
"type": "text"
},
{
"value": 1354.74,
"type": "number"
},
{
"value": 9.2064,
"type": "pct"
}
]
},
{
"dataValues": [
{
"value": "Client4",
"type": "text"
},
{
"value": 1225.78,
"type": "number"
},
{
"value": 8.33,
"type": "pct"
}
]
}
],
"summary": [
{
"value": "Total",
"type": "text"
},
{
"value": 11954.07,
"type": "number"
},
{
"value": 81.236,
"type": "pct"
}
]
}
}
And i want o obtain something like:
Client1,2068.73,14.0584
Client2,1511.9,10.2744
Client3,871.15,5.92
Client4,11954.07,81.236
Can you please give me a hint.
import csv
import json
with open("C:\Users\SVC\Desktop\test.json") as file:
x = json.load(file)
f = csv.writer(open("C:\Users\SVC\Desktop\test.csv", "wb+"))
for x in x:
f.writerow(x["P_Portfolio Group"]["data"]["dataValues"]["value"])
but it doesn't work.
Can you please give me a hint.
import csv
import json
with open('C:\Users\SVC\Desktop\test.json') as json_file:
portfolio_group = json.load(json_file)
with open('C:\Users\SVC\Desktop\test.csv', 'w') as csv_file:
csv_obj = csv.writer(csv_file)
for data in portfolio_group['P_Portfolio Group']['data']:
csv_obj.writerow([d['value'] for d in data['dataValues']])
This results in the following C:\Users\SVC\Desktop\test.csv content:
Client1,2068.73,14.0584
Client2,1511.9,10.2744
Client3,1354.74,9.2064
Client4,1225.78,8.33
Use the pandas library:
import pandas as pd
data = pd.read_csv("C:\Users\SVC\Desktop\test.json")
data.to_csv('test.csv')
done

Categories

Resources