How to get a specific JSON value in Python

I am migrating my code from Java to Python, but I am still having some difficulty understanding how to fetch a specific path in JSON using Python.
This is my Java code, which returns a list of account IDs.
public static List<String> v02_JSON_counterparties(String date) {
    baseURI = "https://cdwp/cdw";
    String counterparties =
        given()
            .auth().basic(getJiraUser(), getJiraPass())
            .param("limit", "1000000")
            .param("count", "false")
        .when()
            .get("/counterparties/" + date).body().asString();
    List<String> accountId = extract_accountId(counterparties);
    return accountId;
}

public static List<String> extract_accountId(String res) {
    List<String> ids = JsonPath.read(res, "$..identifier[?(@.accountIdType == 'ACCOUNTID')].accountId");
    return ids;
}
And this is the JSON structure from which I am getting the accountId.
{
    'organisationId': {
        '#value': 'MHI'
    },
    'accountName': 'LAZARD AM DEUT AC LF1632',
    'identifiers': {
        'accountId': 'LAZDLF1632',
        'customerId': 'LAZAMDEUSG',
        'blockAccountCode': 'LAZDEUBDBL',
        'bic': 'LAMDDEF1XXX',
        'identifier': [{
            'accountId': 'MHI',
            'accountIdType': 'REVNCNTR'
        }, {
            'accountId': 'LAZDLF1632',
            'accountIdType': 'ACCOUNTID'
        }, {
            'accountId': 'LAZAMDEUSG',
            'accountIdType': 'MHICUSTID'
        }, {
            'accountId': 'LAZDEUBDBL',
            'accountIdType': 'BLOCKACCOUNT'
        }, {
            'accountId': 'LAMDDEF1XXX',
            'accountIdType': 'ACCOUNTBIC'
        }, {
            'accountId': 'LAZDLF1632',
            'accountIdType': 'GLOBEOP'
        }]
    },
    'isBlocAccount': 'N',
    'accountStatus': 'COMPLETE',
    'products': {
        'productType': [{
            'productLineName': 'CASH',
            'productTypeId': 'PRODMHI1',
            'productTypeName': 'Bond, Equity,Convertible Bond',
            'cleared': 'N',
            'bilateral': 'N',
            'limitInstructions': {
                'limitInstruction': [{
                    'limitAmount': '0',
                    'limitCurrency': 'GBP',
                    'limitType': 'PEAEXPLI',
                    'limitTypeName': 'Cash-Peak Exposure Limit'
                }]
            }
        }]
    },
    'etc': {
        'addressGeneral': 'LZFLUS33XXX',
        'addressAccount': 'LF1632',
        'tradingLevel': 'B'
    },
    'clientBroker': 'C',
    'costCentre': 'Credit Sales',
    'clientLevel': 'SUBAC',
    'accountCreationDate': '2016-10-19T00:00:00.000Z',
    'accountOpeningDate': '2016-10-19T00:00:00.000Z'
}
This is my code in Python
import json, requests, urllib.parse, re
from pandas.io.parsers import read_csv
import pandas as pd
from termcolor import colored
import numpy as np
from glob import glob
import os

# Set up
dateinplay = "2021-09-27"

# Get accountId
cdwCounterparties = (
    f"http://cdwu/cdw/counterparties/?limit=1000000?yyyy-mm-dd={dateinplay}"
)
r = json.loads(requests.get(cdwCounterparties).text)
account_ids = [i['accountId'] for i in data['identifiers']['identifier'] if i['accountIdType'] == "ACCOUNTID"]
I am getting this error when I try to fetch the accountId:
Traceback (most recent call last):
File "h:\DESKTOP\test_check\checkCounterpartie.py", line 54, in <module>
account_ids = [i['accountId'] for i in data['identifiers']['identifier'] if i['accountIdType'] == "ACCOUNTID"]
TypeError: list indices must be integers or slices, not str

If I'm interpreting your question correctly, you want all IDs where accountIdType is "ACCOUNTID".
This gives you that:
account_ids = [i['accountId'] for i in data['identifiers']['identifier'] if i['accountIdType'] == "ACCOUNTID"]
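That comprehension matches the shape of a single counterparty object, but the traceback says the thing being indexed is a list, so the endpoint presumably returns a list of counterparty objects (note too that the snippet indexes data although the response was bound to r). A minimal sketch under that assumption:
import requests

r = requests.get(cdwCounterparties).json()  # assumption: a list of counterparty objects
account_ids = [
    ident['accountId']
    for counterparty in r  # walk each counterparty in the list
    for ident in counterparty['identifiers']['identifier']
    if ident['accountIdType'] == "ACCOUNTID"
]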

accs = {
    "identifiers": {
        ...

account_id_list = []
for acc in accs.get("identifiers", {}).get("identifier", []):
    account_id_list.append(acc.get("accountId", ""))
creates a list called account_id_list which is
['MHI', 'DKEPBNPGIV', 'DKEPLLP SG', 'DAVKEMEQBL', '401821', 'DKEPGB21XXX', 'DKEPBNPGIV', 'DKPARTNR']

Assuming you store the dictionary (the JSON structure) in a variable x, getting all accountIds is something like:
account_ids = [i['accountId'] for i in x['identifiers']['identifier']]

I'd like to thank you all for your answers. It helped me a lot to find a resolution to my problem.
Below is how I solved it.
import requests
from jsonpath_ng import parse  # assumption: the parse() used here is jsonpath-ng's

listAccountId = []
cdwCounterparties = (
    f"http://cdwu/cdw/counterparties/?limit=100000?yyyy-mm-dd={dateinplay}"
)
r = requests.get(cdwCounterparties).json()

jsonpath_expression = parse("$..accounts.account[*].identifiers.identifier[*]")
for match in jsonpath_expression.find(r):
    # print(f'match id: {match.value}')
    thisdict = match.value
    if thisdict["accountIdType"] == "ACCOUNTID":
        # print(thisdict["accountId"])
        listAccountId.append(thisdict["accountId"])
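The loop can also be collapsed into a list comprehension over the same matches (same jsonpath_expression and r as above):
listAccountId = [
    m.value["accountId"]
    for m in jsonpath_expression.find(r)
    if m.value["accountIdType"] == "ACCOUNTID"
]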

Related

Parsing JSON (String v Int indices)

I'll try to explain the problem as succinctly as possible. I'm trying to filter some values from a log file coming from Elastic. The log outputs this JSON exactly:
{'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': None, 'hits': [{'_index': 'winlogbeat-dc-2022.10.17-000014', '_type': '_doc', '_id': 'vOCnfoQBeS2JF7giMG9q', '_score': None, '_source': {'agent': {'hostname': 'SRVDC1'}, '@timestamp': '2022-11-16T04:19:13.622Z'}, 'sort': [-9223372036854775808]}, {'_index': 'winlogbeat-dc-2022.10.17-000014', '_type': '_doc', '_id': 'veCnfoQBeS2JF7giMG9q', '_score': None, '_source': {'agent': {'hostname': 'SRVDC1'}, '@timestamp': '2022-11-16T04:19:13.630Z'}, 'sort': [-9223372036854775808]}]}}
Now, I want to filter out only the _index and @timestamp keys. If I assign this JSON to a variable, I can perfectly filter out the two keys by running:
index = (data['hits']['hits'][0]['_index'])
timestamp = (data['hits']['hits'][0]['_source']['@timestamp'])
Output:
winlogbeat-dc*
2022-11-16T04:19:13.622Z
However, if I try to do the same directly from the server call, I get:
Traceback (most recent call last):
File "c:\Users\user\Desktop\PYTHON\tiny2.py", line 96, in <module>
query()
File "c:\Users\user\Desktop\PYTHON\tiny2.py", line 77, in query
index = (final_data['hits']['hits'][0]['_index'])
TypeError: string indices must be integers
Now, I understand that it's asking for integer indices instead of the strings I'm using, but if I use integers, I get individual characters rather than a key/value pair.
What am I missing?
UPDATE:
Below is the entire code, but it won't help much. It contains Elastic's DSL query language, and a call to the server, which obviously you won't be able to connect to.
I tried your suggestions, but I either get the same error, or a new one:
raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not ObjectApiResponse
Entire code as follows:
import os
import ast
import csv
import json
from elasticsearch import Elasticsearch
from datetime import datetime, timedelta
import datetime

ELASTIC_USERNAME = 'elastic'
ELASTIC_PASSWORD = "abc123"
PORT = str('9200')
HOST = str('10.20.20.131')
CERT = os.path.join(os.path.dirname(__file__), "cert.crt")

initial_time = datetime.datetime.now()
past_time = datetime.datetime.now() - (timedelta(minutes=15))

def query():
    try:  # connection to Elastic server
        es = Elasticsearch(
            "https://10.20.20.131:9200",
            ca_certs=CERT,
            verify_certs=False,
            basic_auth=(ELASTIC_USERNAME, ELASTIC_PASSWORD)
        )
    except ConnectionRefusedError as error:
        print("[-] Connection error")
    else:  # DSL Elastic query of Domain Controller logs
        query_res = es.search(
            index="winlogbeat-dc*",
            body={
                "size": 3,
                "sort": [
                    {
                        "timestamp": {
                            "order": "desc",
                            "unmapped_type": "boolean"
                        }
                    }
                ],
                "_source": [
                    "agent.hostname",
                    "@timestamp"
                ],
                "query": {
                    "bool": {
                        "must": [],
                        "filter": [
                            {
                                "range": {
                                    "@timestamp": {
                                        "format": "strict_date_optional_time",
                                        "gte": f'{initial_time}',
                                        "lte": f'{past_time}'
                                    }
                                }
                            }
                        ],
                        "should": [],
                        "must_not": []
                    }
                }
            }
        )
        if query_res:
            parse_to_json = json.loads(query_res)
            final_data = json.dumps(str(parse_to_json))
            index = ast.literal_eval(final_data)['hits']['hits'][0]['_index']
            timestamp = ast.literal_eval(final_data)['hits']['hits'][0]['_source']['@timestamp']
            columns = ['Index', 'Last Updated']
            rows = [[f'{index}', f'{timestamp}']]
            with open("final_data.csv", 'w') as csv_file:
                write_to_csv = csv.writer(csv_file)
                write_to_csv.writerow(columns)
                write_to_csv.writerows(rows)
            print("CSV file created!")
        else:
            print("Log not found")

query()
If your response really contains Python-style single quotes rather than JSON, use this:
import ast
...
index = ast.literal_eval(final_data)['hits']['hits'][0]['_index']
Otherwise use this:
import json
...
index = json.loads(final_data)['hits']['hits'][0]['_index']
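The difference in one toy example (not from the question): json.loads parses JSON text, ast.literal_eval parses Python literals.
import ast
import json

json.loads('{"a": 1}')          # JSON: double quotes, null/true/false
ast.literal_eval("{'a': 1}")    # Python literal: single quotes, None/True/False
Both calls return {'a': 1}.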
Elasticsearch returns an ObjectApiResponse so you have to parse the _source field:
import json
final_data = json.loads(query_res["_source"])
index = final_data['hits']['hits'][0]['_index']
I'm not sure why you surround the indexing selection with parentheses.
I struggle to make sense of this:
query_res = es.search(...)
if query_res:
    parse_to_json = json.loads(query_res)
    final_data = json.dumps(str(parse_to_json))
    index = ast.literal_eval(final_data)['hits']['hits'][0]['_index']
    timestamp = ast.literal_eval(final_data)['hits']['hits'][0]['_source']['@timestamp']
query_res is an instance of ObjectApiResponse, and you can get data from it like a dictionary right away. Instead you perform a sequence of converting object to string and back again, and then "stringify" it once more, with unpredictable results.
Just do it like they do in ES docs:
first_hit = query_res['hits']['hits'][0]
index = first_hit['_index']
timestamp = first_hit['_source']['@timestamp']
I was able to fix the problem by first running a broad query with
query_res['hits']['hits']
and then running a for loop over the hits for the specific time range I needed.
Here is the code:
for query_res in query_res['hits']['hits']:
    winlogbeat_dc_timestamp = query_res['_source']['@timestamp']
Then another issue arose. I needed to convert datetime format into a string:
# Convert datetime to string
pattern = '%Y-%m-%dT%H:%M:%S.%fZ'
dt = datetime.strptime(winlogbeat_dc_timestamp, pattern)
new_timestamp = str(dt + timedelta(hours=1))[:-3] + 'Z'
And finally format it to a more readable pattern:
# Format timestamp to a more readable pattern
formatted_time = (''.join(letter for letter in new_timestamp if not letter.isalpha()))[:19]
formatted_time2 = formatted_time[:10] + ' / ' + formatted_time[10:]
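As a side note, strftime can produce the readable form in one step instead of stripping letters (a small sketch, assuming the same winlogbeat_dc_timestamp as above):
from datetime import datetime, timedelta

pattern = '%Y-%m-%dT%H:%M:%S.%fZ'
dt = datetime.strptime(winlogbeat_dc_timestamp, pattern)
# same +1 hour shift, formatted straight to 'YYYY-MM-DD / HH:MM:SS'
formatted_time2 = (dt + timedelta(hours=1)).strftime('%Y-%m-%d / %H:%M:%S')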

How do I get the value of a dict item within a list, within a dict?

How do I get the value of a dict item within a list, within a dict in Python? Please see the following code for an example of what I mean.
I use the following lines of code in Python to get data from an API.
res = requests.get('https://api.data.amsterdam.nl/bag/v1.1/nummeraanduiding/', params)
data = res.json()
data then contains the following Python dictionary:
{
    '_links': {
        'next': {
            'href': None
        },
        'previous': {
            'href': None
        },
        'self': {
            'href': 'https://api.data.amsterdam.nl/bag/v1.1/nummeraanduiding/'
        }
    },
    'count': 1,
    'results': [
        {
            '_display': 'Maple Street 99',
            '_links': {
                'self': {
                    'href': 'https://api.data.amsterdam.nl/bag/v1.1/nummeraanduiding/XXXXXXXXXXXXXXXX/'
                }
            },
            'dataset': 'bag',
            'landelijk_id': 'XXXXXXXXXXXXXXXX',
            'type_adres': 'Hoofdadres',
            'vbo_status': 'Verblijfsobject in gebruik'
        }
    ]
}
Using Python, how do I get the value for 'landelijk_id', represented by the Xs?
This should work:
>>> data['results'][0]['landelijk_id']
"XXXXXXXXXXXXXXXX"
You can just chain those [] for each child you need to access.
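If any level might be missing, dict.get with a default avoids a KeyError (a defensive variant of the same lookup, assuming the response shape shown above):
results = data.get('results', [])
landelijk_id = results[0]['landelijk_id'] if results else None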
I'd recommend using the jmespath package to make handling nested dictionaries easier. https://pypi.org/project/jmespath/
import jmespath
import requests

res = requests.get('https://api.data.amsterdam.nl/bag/v1.1/nummeraanduiding/', params)
data = res.json()
print(jmespath.search('results[].landelijk_id', data))
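Given the response above, this prints the list of every landelijk_id under results, i.e. ['XXXXXXXXXXXXXXXX'] here.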

How to create the given jSON format from a pandas dataframe?

The data looks like this:
The expected JSON format is like this:
{
    "DataExtractName": "SalesDataExtract",
    "BusinessName": {
        "InvoiceDate": {
            "SourceSystem": {
                "MYSQL": "Invc_Dt",
                "CSV": "Invc_Date"
            },
            "DataType": {
                "MYSQL": "varchar",
                "CSV": "string"
            }
        },
        "Description": {
            "SourceSystem": {
                "MYSQL": "Prod_Desc",
                "CSV": "Prod_Descr"
            },
            "DataType": {
                "MYSQL": "varchar",
                "CSV": "string"
            }
        }
    }
},
{
    "DataExtractName": "DateDataExtract",
    "BusinessName": {
        "InvoiceDate": {
            "SourceSystem": {
                "MYSQL": "Date"
            },
            "DataType": {
                "MYSQL": "varchar"
            }
        }
    }
}
How do I achieve this using pandas DataFrames? Or do I need to write a script to shape the data like this?
Note
I've tried using -
df.to_json
df.to_dict
With so many nested structures, you should use marshmallow. It is built with your use case in mind. Please check out the excellent documentation: https://marshmallow.readthedocs.io/en/stable/ . All you need is the basic usage.
It is a lot of code, but better to be explicit than clever. I am sure a shorter solution exists, but it is probably unmaintainable. Also, I had to build your DataFrame myself; please provide it in a data format next time.
import pandas as pd
import marshmallow as ma

# build test data
df = pd.DataFrame.from_records([
    ['InvoiceDate', 'MYSQL', 'Invc_Dt', 'varchar', 'SalesDataExtract'],
    ['InvoiceDate', 'CSV', 'Invc_Date', 'string', 'SalesDataExtract'],
    ['Description', 'MYSQL', 'Prod_Descr', 'varchar', 'SalesDataExtract'],
    ['Description', 'CSV', 'Prod_Descr', 'string', 'SalesDataExtract'],
    ['InvoiceDate', 'MYSQL', 'Date', 'varchar', 'DateDataExtract'],
])
df.columns = ['BusinessName', 'SourceSystem', 'FunctionalName', 'DataType', 'DataExtractName']

# define marshmallow schemas
class SourceSystemTypeSchema(ma.Schema):
    MYSQL = ma.fields.String()
    CSV = ma.fields.String()

class DataTypeSchema(ma.Schema):
    MYSQL = ma.fields.String()
    CSV = ma.fields.String()

class InvoiceDateSchema(ma.Schema):
    InvoiceDate = ma.fields.Nested(SourceSystemTypeSchema())
    DataType = ma.fields.Nested(DataTypeSchema())

class DescriptionSchema(ma.Schema):
    SourceSystem = ma.fields.Nested(SourceSystemTypeSchema())
    DataType = ma.fields.Nested(DataTypeSchema())

class BusinessNameSchema(ma.Schema):
    InvoiceDate = ma.fields.Nested(InvoiceDateSchema())
    Description = ma.fields.Nested(DescriptionSchema())

class DataSchema(ma.Schema):
    DataExtractName = ma.fields.String()
    BusinessName = ma.fields.Nested(BusinessNameSchema())

# building json
result = []
mask_business_name_invoicedate = df.BusinessName == 'InvoiceDate'
mask_business_name_description = df.BusinessName == 'Description'
for data_extract_name in set(df['DataExtractName'].to_list()):
    mask_data_extract_name = df.DataExtractName == data_extract_name
    # you need these two helper dfs to get the dictionaries
    df_source_system = df[mask_data_extract_name & mask_business_name_invoicedate].set_index('SourceSystem').to_dict(orient='dict')
    df_description = df[mask_data_extract_name & mask_business_name_description].set_index('SourceSystem').to_dict(orient='dict')

    # all dictionaries are defined, so you can use your schemas
    source_system_type = SourceSystemTypeSchema().dump(df_source_system['FunctionalName'])
    data_type = DataTypeSchema().dump(df_source_system['DataType'])
    source_system = SourceSystemTypeSchema().dump(df_description['FunctionalName'])
    invoice_date = InvoiceDateSchema().dump({'SourceSystemType': source_system_type, 'DataType': data_type})
    description = DescriptionSchema().dump({'SourceSystem': source_system, 'DataType': data_type})
    business_name = BusinessNameSchema().dump({'InvoiceDate': invoice_date, 'Description': description})
    data = DataSchema().dump({'DataExtractName': data_extract_name, 'BusinessName': business_name})

    # end result
    result.append(data)
Now,
ma.pprint(result)
returns
[{'BusinessName': {'Description': {'DataType': {'CSV': 'string',
                                                'MYSQL': 'varchar'},
                                   'SourceSystem': {'CSV': 'Prod_Descr',
                                                    'MYSQL': 'Prod_Descr'}},
                   'InvoiceDate': {'DataType': {'CSV': 'string',
                                                'MYSQL': 'varchar'}}},
  'DataExtractName': 'SalesDataExtract'},
 {'BusinessName': {'Description': {'DataType': {'MYSQL': 'varchar'},
                                   'SourceSystem': {}},
                   'InvoiceDate': {'DataType': {'MYSQL': 'varchar'}}},
  'DataExtractName': 'DateDataExtract'}]
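For comparison, the same shape can also be built without schemas using plain pandas groupby (a minimal sketch over the same df as above; dict building only, no marshmallow):
result = []
for name, group in df.groupby('DataExtractName'):
    business = {}
    for business_name, sub in group.groupby('BusinessName'):
        business[business_name] = {
            # map each SourceSystem to its FunctionalName / DataType
            'SourceSystem': dict(zip(sub.SourceSystem, sub.FunctionalName)),
            'DataType': dict(zip(sub.SourceSystem, sub.DataType)),
        }
    result.append({'DataExtractName': name, 'BusinessName': business})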

AWS Glue: How to expand nested Hive struct to Dict?

I'm trying to expand field mappings in a Table mapped by my AWS Glue crawler to a nested dictionary in Python, but I can't find any Spark/Hive parsers to deserialize the
var_type = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
string located in table_schema['Table']['StorageDescriptor']['Columns'] into a Python dict.
How to dump the table definition in Glue:
import boto3
client = boto3.client('glue')
client.get_table(DatabaseName=selected_db, Name=selected_table)
Response:
table_schema = {
    'Table': {
        'Name': 'asdfasdf',
        'DatabaseName': 'asdfasdf',
        'Owner': 'owner',
        'CreateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'UpdateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'LastAccessTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'Retention': 0,
        'StorageDescriptor': {
            'Columns': [
                {'Name': 'version', 'Type': 'int'},
                {'Name': 'payload',
                 'Type': 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'},
                {'Name': 'origin', 'Type': 'string'}
            ],
            'Location': 's3://asdfasdf/',
            'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
            'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
            'Compressed': False,
            'NumberOfBuckets': -1,
            'SerdeInfo': {
                'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe',
                'Parameters': {'paths': 'origin,payload,version'}
            },
            'BucketColumns': [],
            'SortColumns': [],
            'Parameters': {
                'CrawlerSchemaDeserializerVersion': '1.0',
                'CrawlerSchemaSerializerVersion': '1.0',
                'UPDATED_BY_CRAWLER': 'asdfasdf',
                'averageRecordSize': '799',
                'classification': 'json',
                'compressionType': 'none',
                'objectCount': '94',
                'recordCount': '92171',
                'sizeKey': '74221058',
                'typeOfData': 'file'
            },
            'StoredAsSubDirectories': False
        },
        'PartitionKeys': [
            {'Name': 'partition_0', 'Type': 'string'},
            {'Name': 'partition_1', 'Type': 'string'},
            {'Name': 'partition_2', 'Type': 'string'}
        ],
        'TableType': 'EXTERNAL_TABLE',
        'Parameters': {
            'CrawlerSchemaDeserializerVersion': '1.0',
            'CrawlerSchemaSerializerVersion': '1.0',
            'UPDATED_BY_CRAWLER': 'asdfasdf',
            'averageRecordSize': '799',
            'classification': 'json',
            'compressionType': 'none',
            'objectCount': '94',
            'recordCount': '92171',
            'sizeKey': '74221058',
            'typeOfData': 'file'
        },
        'CreatedBy': 'arn:aws:sts::asdfasdf'
    },
    'ResponseMetadata': {
        'RequestId': 'asdfasdf',
        'HTTPStatusCode': 200,
        'HTTPHeaders': {
            'date': 'Thu, 01 Aug 2019 16:23:06 GMT',
            'content-type': 'application/x-amz-json-1.1',
            'content-length': '3471',
            'connection': 'keep-alive',
            'x-amzn-requestid': 'asdfasdf'
        },
        'RetryAttempts': 0
    }
}
The goal would be a Python dictionary with values for each field type, instead of the embedded string, e.g.
expand_function('struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>')
returns
{
    'loc_lat': 'double',
    'service_handler': 'string',
    'ip_address': 'string',
    'device': 'bigint',
    'source': {
        'id': 'string',
        'contacts': {
            'admin': {
                'email': 'string',
                'name': 'string'
            }
        },
        'name': 'string'
    },
    'loc_name': 'string'
}
Thanks!
The accepted answer doesn't handle arrays.
This solution does:
import json
import re

def _hive_struct_to_json(hive_str):
    """
    Expands embedded Hive struct strings to Python dictionaries
    Args:
        Hive struct format as string
    Returns:
        JSON object
    """
    r = re.compile(r'(.*?)(struct<|array<|[:,>])(.*)')
    root = dict()

    to_parse = hive_str
    parents = []
    curr_elem = root
    key = None
    while to_parse:
        left, operator, to_parse = r.match(to_parse).groups()

        if operator == 'struct<' or operator == 'array<':
            parents.append(curr_elem)
            new_elem = dict() if operator == 'struct<' else list()
            if key:
                curr_elem[key] = new_elem
                curr_elem = new_elem
            elif isinstance(curr_elem, list):
                curr_elem.append(new_elem)
                curr_elem = new_elem
            key = None
        elif operator == ':':
            key = left
        elif operator == ',' or operator == '>':
            if left:
                if isinstance(curr_elem, dict):
                    curr_elem[key] = left
                elif isinstance(curr_elem, list):
                    curr_elem.append(left)
            if operator == '>':
                curr_elem = parents.pop()

    return root
hive_str = '''
struct<
    loc_lat:double,
    service_handler:string,
    ip_address:string,
    device:bigint,
    source:struct<
        id:string,
        contacts:struct<
            admin:struct<
                email:string,
                name:array<string>
            >
        >,
        name:string
    >,
    loc_name:string,
    tags:array<
        struct<
            key:string,
            value:string
        >
    >
>
'''
hive_str = re.sub(r'[\s]+', '', hive_str).strip()
print(hive_str)

print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
Prints:
struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:array<string>>>,name:string>,loc_name:string,tags:array<struct<key:string,value:string>>>
{
  "loc_lat": "double",
  "service_handler": "string",
  "ip_address": "string",
  "device": "bigint",
  "source": {
    "id": "string",
    "contacts": {
      "admin": {
        "email": "string",
        "name": [
          "string"
        ]
      }
    },
    "name": "string"
  },
  "loc_name": "string",
  "tags": [
    {
      "key": "string",
      "value": "string"
    }
  ]
}
Here's a function running on the embedded Hive struct string above.
import json

def _hive_struct_to_json(hive_struct):
    """
    Expands embedded Hive struct strings to Python dictionaries
    Args:
        Hive struct format as string
    Returns:
        JSON object
    """
    # Convert embedded hive type definition string to JSON
    hive_struct = hive_struct.replace(':', '":"')
    hive_struct = hive_struct.replace(',', '","')
    hive_struct = hive_struct.replace('struct<', '{"')
    hive_struct = hive_struct.replace('"{"', '{"')
    hive_struct = hive_struct.replace('>', '"}')
    hive_struct = hive_struct.replace('}"', '}')
    return json.loads(hive_struct)

hive_str = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
Returns:
{
  "loc_lat": "double",
  "service_handler": "string",
  "ip_address": "string",
  "device": "bigint",
  "source": {
    "id": "string",
    "contacts": {
      "admin": {
        "email": "string",
        "name": "string"
      }
    },
    "name": "string"
  },
  "loc_name": "string"
}
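As noted above, this replace-based version breaks on array<...> types, since nothing maps array< to a JSON list:
_hive_struct_to_json('struct<tags:array<string>>')
# raises json.JSONDecodeError ('Extra data'): the replaces leave unbalanced braces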
I tried to scout some existing approaches and found helper functions in pyspark.
import pyspark.sql.types as T
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("tmp").getOrCreate()
struct_map = T._parse_datatype_string("MAP<STRING, STRUCT<year: INT, place: STRING, details: STRING>>")
struct_map is a pyspark type that in turn has nested fields to iterate over. Once you have an object like the one above, performing a recursive call to flatten it should be easy. I am open to hearing opinions from others about this approach.
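One way to get a plain dict without hand-writing the recursion is DataType.jsonValue(), which pyspark types implement (a sketch reusing struct_map from above):
# nested dict such as {'type': 'map', 'keyType': 'string', 'valueType': {...}}
print(struct_map.jsonValue())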

Replace value in JSON file for key which can be nested by n levels

I have JSON that looks like this:
{
    "ROLE_NAME": {
        "FOO": {
            "download_url": "http://something.staging/12345/buzz.zip"
        },
        "BAR": {
            "download_url": "http://something.staging/12345/fizz.zip"
        },
        "download_url": "http://something.staging/12345/fizzbuzz.zip",
        "db_name": "somedb",
        "db_server": "dbserver.staging.dmz",
        "plugin": {
            "server_url": "http://lab.staging.corp/server/"
        }
    }
}
I wrote a bit of Python that replaces the "download_url" key's value with a new one (i.e. a new download_url). Unfortunately it only replaces one of the three download_urls in that JSON snippet. I understand why, but I am having a little difficulty getting to the solution, so I am here asking for help.
The entire JSON object is stored in "data".
So I do something like this:
data["ROLE_NAME"]["download_url"] = download_url
Where download_url is a new value I have assigned to that variable
What I need is to update any key called "download_url", wherever it appears, rather than only the one at the layer I am explicitly indexing.
Some of my code to help:
I take some values obtained earlier in my code and build a URL, which returns a response. I extract a value from the response that will be used to build the value of download_url.
buildinfo_url = "http://something.staging/guestAuth/app/rest/builds/?locator=buildType:%s,tags:%s,branch:branched:any" % (
    bt_number,
    list_json_load[role_name][0]['tag']
)
Send HTTP request
client = httplib2.Http()
response, xml = client.request(buildinfo_url)
Extract a value from the response XML and set the download_url variable
doc = ElementTree.fromstring(xml)
for id in doc.findall('build'):
    build_id = "%s" % (id.attrib['id'])
    try:
        download_url = "http://something.staging/guestAuth/repository/download/%s/%s:id/%s" % (
            bt_number,
            build_id,
            build_artifact_zip
        )
        data[role_name]["download_url"] = download_url
    except NameError:
        print "something"
I think I should be recursively searching and updating
Using recursion
import json

json_txt = """
{
    "ROLE_NAME": {
        "FOO": {
            "download_url": "http://something.staging/12345/buzz.zip"
        },
        "BAR": {
            "download_url": "http://something.staging/12345/fizz.zip"
        },
        "download_url": "http://something.staging/12345/fizzbuzz.zip",
        "db_name": "somedb",
        "db_server": "dbserver.staging.dmz",
        "plugin": {
            "server_url": "http://lab.staging.corp/server/"
        }
    }
}
"""
data = json.loads(json_txt)

def fixup(adict, k, v):
    for key in adict.keys():
        if key == k:
            adict[key] = v
        elif type(adict[key]) is dict:
            fixup(adict[key], k, v)

import pprint
pprint.pprint(data)
fixup(data, 'download_url', 'XXX')
pprint.pprint(data)
Output:
{u'ROLE_NAME': {u'BAR': {u'download_url': u'http://something.staging/12345/fizz.zip'},
                u'FOO': {u'download_url': u'http://something.staging/12345/buzz.zip'},
                u'db_name': u'somedb',
                u'db_server': u'dbserver.staging.dmz',
                u'download_url': u'http://something.staging/12345/fizzbuzz.zip',
                u'plugin': {u'server_url': u'http://lab.staging.corp/server/'}}}
{u'ROLE_NAME': {u'BAR': {u'download_url': 'XXX'},
                u'FOO': {u'download_url': 'XXX'},
                u'db_name': u'somedb',
                u'db_server': u'dbserver.staging.dmz',
                u'download_url': 'XXX',
                u'plugin': {u'server_url': u'http://lab.staging.corp/server/'}}}
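Note that fixup only recurses into dicts; if the JSON may contain lists of objects, a variant that walks both is needed (a sketch with the same call signature):
def fixup_all(node, k, v):
    if isinstance(node, dict):
        for key, val in node.items():
            if key == k:
                node[key] = v
            else:
                fixup_all(val, k, v)
    elif isinstance(node, list):
        for item in node:
            fixup_all(item, k, v)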
