parse weird yaml file uploaded to server with python

I have a config server that we read the service config from.
On it we have a YAML file that I need to read, but it has a weird format on the server, looking like:
{
"document[0].Name": "os",
"document[0].Rules.Rule1": false,
"document[0].Rules.Rule2": true,
"document[0].MinScore": 100,
"document[0].MaxScore": 100,
"document[0].ClusterId": 22,
"document[0].Enabled": true,
"document[0].Module": "device",
"document[0].Description": "",
"document[0].Modified": 1577880000000,
"document[0].Created": 1577880000000,
"document[0].RequiredReview": false,
"document[0].Type": "NO_CODE",
"document[1].Name": "rule with params test",
"document[1].Rules.Rule": false,
"document[1].MinScore": 100,
"document[1].MaxScore": 100,
"document[1].ClusterId": 29,
"document[1].Enabled": true,
"document[1].Module": "device",
"document[1].Description": "rule with params test",
"document[1].Modified": 1577880000000,
"document[1].Created": 1577880000000,
"document[1].RequiredReview": false,
"document[1].Type": "NO_CODE",
"document[1].ParametersRules[0].Features.feature1.op": ">",
"document[1].ParametersRules[0].Features.feature1.value": 10,
"document[1].ParametersRules[0].Features.feature2.op": "==",
"document[1].ParametersRules[0].Features.feature2.value": true,
"document[1].ParametersRules[0].Features.feature3.op": "range",
"document[1].ParametersRules[0].Features.feature3.value[0]": 4,
"document[1].ParametersRules[0].Features.feature3.value[1]": 10,
"document[1].ParametersRules[0].Features.feature4.op": "!=",
"document[1].ParametersRules[0].Features.feature4.value": "None",
"document[1].ParametersRules[0].DecisionType": "all",
"document[1].ParametersRules[1].Features.feature5.op": "<",
"document[1].ParametersRules[1].Features.feature5.value": 1000,
"document[1].ParametersRules[1].DecisionType": "any"
}
and this is how the dict is supposed to look (it might not be perfect; I did it by hand):
[
  {
    "Name": "os",
    "Rules": { "Rule1": false, "Rule2": true },
    "MinScore": 100,
    "MaxScore": 100,
    "ClusterId": 22,
    "Enabled": true,
    "Module": "device",
    "Description": "",
    "Modified": 1577880000000,
    "Created": 1577880000000,
    "RequiredReview": false,
    "Type": "NO_CODE"
  },
  {
    "Name": "rule with params test",
    "Rules": { "Rule": false },
    "MinScore": 100,
    "MaxScore": 100,
    "ClusterId": 29,
    "Enabled": true,
    "Module": "device",
    "Description": "rule with params test",
    "Modified": 1577880000000,
    "Created": 1577880000000,
    "RequiredReview": false,
    "Type": "NO_CODE",
    "ParametersRules": [
      {
        "Features": {
          "feature1": { "op": ">", "value": 10 },
          "feature2": { "op": "==", "value": true },
          "feature3": { "op": "range", "value": [4, 10] },
          "feature4": { "op": "!=", "value": "None" }
        },
        "DecisionType": "all"
      },
      {
        "Features": { "feature5": { "op": "<", "value": 1000 } },
        "DecisionType": "any"
      }
    ]
  }
]
I don't have a way to change how the file is uploaded to the server (it's owned by a different team and quite the headache), so I need to parse it using Python.
My thought is that someone has probably encountered this before, so there must be a package that solves it, and I hoped that someone here might know.
Thanks.

I have a sample; I hope it'll help you.
import os
import yaml

file_dir = os.path.dirname(os.path.abspath(__file__))

# YAML is a superset of JSON, so the uploaded file can be loaded directly
with open(f"{file_dir}/file.json") as json_file:
    config = yaml.full_load(json_file)

# this one turns your JSON file into YAML
with open(f"{file_dir}/meta.yaml", "w+") as yaml_file:
    yaml.dump(config, yaml_file, allow_unicode=True)
Your current output is:
- ClusterId: 22
  Created: 1577880000000
  Description: ''
  Enabled: true
  MaxScore: 100
  MinScore: 100
  Modified: 1577880000000
  Module: device
  Name: os
  RequiredReview: false
  Rules:
    Rule1: false
    Rule2: true
  Type: NO_CODE
- ClusterId: 29
  Created: 1577880000000
  Description: rule with params test
  Enabled: true
  MaxScore: 100
  MinScore: 100
  Modified: 1577880000000
  Module: device
  Name: rule with params test
  ParametersRules:
  - DecisionType: all
    Features:
      feature1:
        op: '>'
        value: 10
      feature2:
        op: ==
        value: true
      feature3:
        op: range
        value:
        - 4
        - 10
      feature4:
        op: '!='
        value: None
  - DecisionType: any
    Features:
      feature5:
        op: <
        value: 1000
  RequiredReview: false
  Rules:
    Rule: false
  Type: NO_CODE

Here is my approach so far. It's far from perfect, but I hope it gives you an idea of how to tackle it.
from __future__ import annotations  # can be removed in Python 3.10+


def clean_value(o: str | bool | int) -> str | bool | int | None:
    """handle int, None, or bool values encoded as a string"""
    if isinstance(o, str):
        lowercase = o.lower()
        if lowercase.isnumeric():
            return int(o)
        elif lowercase == 'none':
            return None
        elif lowercase in ('true', 'false'):
            return lowercase == 'true'
        # return eval(o.capitalize())
    return o


# noinspection PyUnboundLocalVariable
def process(o: dict):
    # final return list
    docs_list = []

    doc: dict[str, list | dict | str | bool | int | None]
    doc_idx: int

    def add_new_doc(new_idx: int):
        """Push new item to result list, and increment index."""
        nonlocal doc_idx, doc
        doc_idx = new_idx
        doc = {}
        docs_list.append(doc)

    # add initial `dict` object to return list
    add_new_doc(0)

    for k, v in o.items():
        doc_id, key, *parts = k.split('.')
        doc_id: str
        key: str
        parts: list[str]

        curr_doc_idx = int(doc_id.rsplit('[', 1)[1].rstrip(']'))
        if curr_doc_idx > doc_idx:
            add_new_doc(curr_doc_idx)

        if not parts:
            final_val = clean_value(v)
        elif key in doc:
            # For example, when we encounter `document[0].Rules.Rule2`, but we've
            # already encountered `document[0].Rules.Rule1` - so in this case, we
            # add the value to the existing dict.
            final_val = temp_dict = doc[key]
            temp_dict: dict
            for p in parts[:-1]:
                temp_dict = temp_dict.setdefault(p, {})
            temp_dict[parts[-1]] = clean_value(v)
        else:
            final_val = temp_dict = {}
            for p in parts[:-1]:
                temp_dict = temp_dict[p] = {}
            temp_dict[parts[-1]] = clean_value(v)

        doc[key] = final_val

    return docs_list


if __name__ == '__main__':
    import json
    from pprint import pprint

    j = """{
      "document[0].Name": "os",
      "document[0].Rules.Rule1": false,
      "document[0].Rules.Rule2": "true",
      "document[0].MinScore": 100,
      "document[0].MaxScore": 100,
      "document[0].ClusterId": 22,
      "document[0].Enabled": true,
      "document[0].Module": "device",
      "document[0].Description": "",
      "document[0].Modified": 1577880000000,
      "document[0].Created": 1577880000000,
      "document[0].RequiredReview": false,
      "document[0].Type": "NO_CODE",
      "document[1].Name": "rule with params test",
      "document[1].Rules.Rule": false,
      "document[1].MinScore": 100,
      "document[1].MaxScore": 100,
      "document[1].ClusterId": 29,
      "document[1].Enabled": true,
      "document[1].Module": "device",
      "document[1].Description": "rule with params test",
      "document[1].Modified": 1577880000000,
      "document[1].Created": 1577880000000,
      "document[1].RequiredReview": false,
      "document[1].Type": "NO_CODE",
      "document[1].ParametersRules[0].Features.feature1.op": ">",
      "document[1].ParametersRules[0].Features.feature1.value": 10,
      "document[1].ParametersRules[0].Features.feature2.op": "==",
      "document[1].ParametersRules[0].Features.feature2.value": true,
      "document[1].ParametersRules[0].Features.feature3.op": "range",
      "document[1].ParametersRules[0].Features.feature3.value[0]": 4,
      "document[1].ParametersRules[0].Features.feature3.value[1]": 10,
      "document[1].ParametersRules[0].Features.feature4.op": "!=",
      "document[1].ParametersRules[0].Features.feature4.value": "None",
      "document[1].ParametersRules[0].DecisionType": "all",
      "document[1].ParametersRules[1].Features.feature5.op": "<",
      "document[1].ParametersRules[1].Features.feature5.value": 1000,
      "document[1].ParametersRules[1].DecisionType": "any"
    }"""

    d: dict[str, str | bool | int | None] = json.loads(j)

    result = process(d)
    pprint(result)
Result:
[{'ClusterId': 22,
  'Created': 1577880000000,
  'Description': '',
  'Enabled': True,
  'MaxScore': 100,
  'MinScore': 100,
  'Modified': 1577880000000,
  'Module': 'device',
  'Name': 'os',
  'RequiredReview': False,
  'Rules': {'Rule1': False, 'Rule2': True},
  'Type': 'NO_CODE'},
 {'ClusterId': 29,
  'Created': 1577880000000,
  'Description': 'rule with params test',
  'Enabled': True,
  'MaxScore': 100,
  'MinScore': 100,
  'Modified': 1577880000000,
  'Module': 'device',
  'Name': 'rule with params test',
  'ParametersRules[0]': {'DecisionType': 'all',
                         'Features': {'feature1': {'value': 10},
                                      'feature2': {'op': '==', 'value': True},
                                      'feature3': {'op': 'range',
                                                   'value[0]': 4,
                                                   'value[1]': 10},
                                      'feature4': {'op': '!=', 'value': None}}},
  'ParametersRules[1]': {'DecisionType': 'any',
                         'Features': {'feature5': {'value': 1000}}},
  'RequiredReview': False,
  'Rules': {'Rule': False},
  'Type': 'NO_CODE'}]
Of course, one of the problems is that it doesn't account for indexed paths like document[1].ParametersRules[0].Features.feature1.op, which should ideally create a new sub-list to add values to.
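For what it's worth, here is a more general sketch that does treat bracketed segments like ParametersRules[0] and value[1] as list indices, building lists and dicts as it walks each path. It's my own illustration under the assumption that every [n] denotes a list position - a starting point, not a vetted package:

import re

# a path token is either a plain name or a bracketed integer index
_TOKEN = re.compile(r'([^.\[\]]+)|\[(\d+)\]')


def unflatten(flat: dict) -> list:
    """Rebuild nested documents from 'document[i].a.b[j]'-style keys."""
    root = []
    for path, value in flat.items():
        # 'document[1].Features.value[0]' -> ['document', 1, 'Features', 'value', 0]
        keys = [int(idx) if idx else name for name, idx in _TOKEN.findall(path)]
        keys = keys[1:]  # drop the leading 'document' segment
        target = root
        for curr, nxt in zip(keys, keys[1:]):
            container = [] if isinstance(nxt, int) else {}
            if isinstance(curr, int):
                while len(target) <= curr:  # grow the list as needed
                    target.append(None)
                if target[curr] is None:
                    target[curr] = container
                target = target[curr]
            else:
                target = target.setdefault(curr, container)
        last = keys[-1]
        if isinstance(last, int):
            while len(target) <= last:
                target.append(None)
        target[last] = value
    return root


sample = {
    "document[0].Name": "os",
    "document[1].ParametersRules[0].Features.feature3.value[1]": 10,
}
print(unflatten(sample))
# [{'Name': 'os'}, {'ParametersRules': [{'Features': {'feature3': {'value': [None, 10]}}}]}]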

retrieving data from json with python

I have nested JSON data. I want to get the value of the key "name" inside the "value" dictionary, based on the key "id" in the "key" dictionary (letting the user enter the id). I don't want to use indexing, because the positions change on every URL. Also, the data is large, so I need a one-line solution (without a for loop).
Code
import requests, re, json
r = requests.get('https://www.trendyol.com/apple/macbook-air-13-m1-8gb-256gb-ssd-altin-p-67940132').text
json_data1 = json.loads(re.search(r"window.__PRODUCT_DETAIL_APP_INITIAL_STATE__=({.*}});window", r).group(1))
print(json_data1)
print('json_data1:',json_data1['product']['attributes'][0]['value']['name'])
Output
{'product': {'attributes': [{'key': {'name': 'İşlemci Tipi', 'id': 168}, 'value': {'name': 'Apple M1', 'id': 243383}, 'starred': True, 'description': '', 'mediaUrls': []}, {'key': {'name': 'SSD Kapasitesi', 'id': 249}..........
json_data1: Apple M1
JSON Data
{
  "product": {
    "attributes": [
      {
        "key": { "name": "İşlemci Tipi", "id": 168 },
        "value": { "name": "Apple M1", "id": 243383 },
        "starred": true,
        "description": "",
        "mediaUrls": []
      },
      {
        "key": { "name": "SSD Kapasitesi", "id": 249 },
        "value": { "name": "256 GB", "id": 3376 },
        "starred": true,
        "description": "",
        "mediaUrls": []
      },
      ...
    ]
  }
}
The expected output is getting the value by key id (the type must be str):
input >> id: 168
output >> name: Apple M1
Since you originally didn't want a for loop, but now it's a matter of speed, here's a solution with a for loop; you can test it and see if it's faster than the one you already had.
import json

with open("file.json") as f:
    data = json.load(f)

search_key = int(input("Enter id: "))

for i in range(0, len(data['product']['attributes'])):
    if search_key == data['product']['attributes'][i]['key']['id']:
        print(data['product']['attributes'][i]['value']['name'])
Input >> Enter id: 168
Output >> Apple M1
I found a solution with a for loop. It works fast, so I preferred it.
for i in json_data1['product']['attributes']:
    cpu = list(list(i.values())[0].values())[1]
    if cpu == 168:
        print(list(list(i.values())[1].values())[0])
Iteration is unavoidable if the index is unknown, but the cost can be reduced substantially by using a generator expression and Python's built-in next function:
next((x["value"]["name"] for x in data["product"]["attributes"] if x["key"]["id"] == 168), None)
To verify that a generator expression is in fact faster than a for loop, here is a comparison of the running time of xFranko's solution and the above:
import time


def time_func(func):
    def timer(*args):
        time1 = time.perf_counter()
        func(*args)
        time2 = time.perf_counter()
        return (time2 - time1) * 1000
    return timer


number_of_attributes = 100000

data = {
    "product": {
        "attributes": [
            {
                "key": { "name": "İşlemci Tipi", "id": i },
                "value": { "name": "name" + str(i), "id": 243383 },
                "starred": True,
                "description": "",
                "mediaUrls": []
            } for i in range(number_of_attributes)
        ]
    }
}


def getName_generator(id):
    return next((x["value"]["name"] for x in data["product"]["attributes"] if x["key"]["id"] == id), None)


def getName_for_loop(id):
    return_value = None
    for i in range(0, len(data['product']['attributes'])):
        if id == data['product']['attributes'][i]['key']['id']:
            return_value = data['product']['attributes'][i]['value']['name']
    return return_value


print("Generator:", time_func(getName_generator)(0))
print("For loop:", time_func(getName_for_loop)(0))
print()
print("Generator:", time_func(getName_generator)(number_of_attributes - 1))
print("For loop:", time_func(getName_for_loop)(number_of_attributes - 1))
My results:
Generator: 0.0075999999999964984
For loop: 43.73920000000003
Generator: 23.633300000000023
For loop: 49.839699999999986
Conclusion:
For large data sets, a generator expression is indeed faster, even if it has to traverse the entire set.
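One more option (my own addition, not benchmarked above): if the same data will be queried repeatedly, build a dict index once, so that every subsequent lookup is a constant-time dictionary access rather than a scan. Applied to the question's json_data1:

# one O(n) pass builds the index; each lookup afterwards is O(1)
names_by_id = {x["key"]["id"]: x["value"]["name"]
               for x in json_data1["product"]["attributes"]}

print(names_by_id.get(168))  # 'Apple M1'
print(names_by_id.get(-1))   # None when the id is absent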

Change value dynamically with click.option callback function

So I want to change the default list in the program by reading an option from the command line.
Using this answer by @Stephen Rauch, I added a simple option to click with a simple callback function set_list(). It seems to run before the command-line arguments are created, but my default_option global variable is not changed.
Here is the code:
import click

run_options1 = [
    {
        "name": "name1",
        "short": "a",
        "long": "ace",
        "type": "string",
        "required": False
    }, {
        "name": "name2",
        "short": "b",
        "long": "bravo",
        "type": "number",
        "required": True
    }, {
        "name": "name3",
        "short": "c",
        "long": "candy",
        "type": "array",
        "required": True
    }
]

run_options2 = [
    {
        "name": "family1",
        "short": "a",
        "long": "ace",
        "type": "string",
        "required": False
    }, {
        "name": "family2",
        "short": "b",
        "long": "bravo",
        "type": "number",
        "required": True
    }, {
        "name": "family3",
        "short": "c",
        "long": "candy",
        "type": "array",
        "required": True
    }
]

# set default list global variable
default_option = run_options1


class OptionEatAll(click.Option):

    def __init__(self, *args, **kwargs):
        self.save_other_options = kwargs.pop('save_other_options', True)
        nargs = kwargs.pop('nargs', -1)
        assert nargs == -1, 'nargs, if set, must be -1 not {}'.format(nargs)
        super(OptionEatAll, self).__init__(*args, **kwargs)
        self._previous_parser_process = None
        self._eat_all_parser = None

    def add_to_parser(self, parser, ctx):

        def parser_process(value, state):
            # method to hook to the parser.process
            done = False
            value = [value]
            if self.save_other_options:
                # grab everything up to the next option
                while state.rargs and not done:
                    for prefix in self._eat_all_parser.prefixes:
                        if state.rargs[0].startswith(prefix):
                            done = True
                    if not done:
                        value.append(state.rargs.pop(0))
            else:
                # grab everything remaining
                value += state.rargs
                state.rargs[:] = []
            value = tuple(value)
            # call the actual process
            self._previous_parser_process(value, state)

        retval = super(OptionEatAll, self).add_to_parser(parser, ctx)
        for name in self.opts:
            our_parser = parser._long_opt.get(
                name) or parser._short_opt.get(name)
            if our_parser:
                self._eat_all_parser = our_parser
                self._previous_parser_process = our_parser.process
                our_parser.process = parser_process
                break
        return retval


def options_from_db(options):
    map_to_types = dict(
        array=str,
        number=float,
        string=str,
    )

    def decorator(f):
        for opt_params in reversed(options):
            param_decls = (
                '-' + opt_params['short'],
                '--' + opt_params['long'],
                opt_params['name'])
            attrs = dict(
                required=opt_params['required'],
                type=map_to_types.get(
                    opt_params['type'], opt_params['type'])
            )
            if opt_params['type'] == 'array':
                attrs['cls'] = OptionEatAll
                attrs['nargs'] = -1
            click.option(*param_decls, **attrs)(f)
        return f

    return decorator


@click.group()
def cli():
    pass


def set_list(ctx, param, value):
    if not value or ctx.resilient_parsing:
        return
    global default_option
    if value == "l2":
        print("change default list")
        default_option = run_options2
    print("default list", default_option)


@cli.command()
@click.option('--list', callback=set_list,
              expose_value=False, is_eager=True)
@options_from_db(default_option)
@click.argument('url')
def run(*args, **kwargs):
    click.echo('args: {}'.format(args))
    click.echo('kwargs: {}'.format(kwargs))


if __name__ == "__main__":
    commands = (
        'run www.mysite.com/api/get -a testparam --bravo 5 -c item1 item2 --list l2',
        '',
        '--help',
        'run --help',
    )

    import time
    time.sleep(1)

    for cmd in commands:
        try:
            time.sleep(0.1)
            print('-----------')
            print('> ' + cmd)
            time.sleep(0.1)
            cli(cmd.split())
        except BaseException as exc:
            if str(exc) != '0' and \
                    not isinstance(exc, (click.ClickException, SystemExit)):
                raise
The program will generate this output:
Results
> run www.mysite.com/api/get -a testparam --bravo 5 -c item1 item2 --list l2
change default list
default list [{'name': 'family1', 'short': 'a', 'long': 'ace', 'type': 'string', 'required': False}, {'name': 'family2', 'short': 'b', 'long': 'bravo', 'type': 'number', 'required': True}, {'name': 'family3', 'short': 'c', 'long': 'candy', 'type': 'array', 'required': True}]
args: ()
kwargs: {'name1': 'testparam', 'name2': 5.0, 'name3': ('item1', 'item2'), 'url': 'www.mysite.com/api/get'}
-----------
Actually, default_option is not changed.
If it worked, the output would be:
kwargs: {'family1': 'testparam', 'family2': 5.0, 'family3': ('item1', 'item2'), 'url': 'www.mysite.com/api/get'}
What can I do to implement that?
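Note that the behavior follows from how decorators work: @options_from_db(default_option) is evaluated once, at import time, before Click ever parses --list, so reassigning the global afterwards cannot affect the already-built parameter list. A minimal sketch of the timing (my own illustration, not from the question):

import click

default_option = ['run_options1']

def options_from_db(options):
    def decorator(f):
        # runs while the module is imported, as soon as `run` is defined
        print('decorating with:', options)
        return f
    return decorator

@options_from_db(default_option)  # captures run_options1 here, once
def run(**kwargs):
    pass

default_option = ['run_options2']  # too late: `run` was already decorated

One workaround is to decide the option list before Click is involved at all, e.g. by peeking at sys.argv for --list and only then applying options_from_db.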

sanitize unicode from json

How do I properly remove the unicode so I can load the JSON?
data = json.loads(json_string)
json.decoder.JSONDecodeError: Invalid \escape: line 1 column 72 (char 71)
{"user": {"user_id": 455830511, "username": "dualipa_384", "name": "Dua\xa0Lipa", "private": false, "verified_user": false, "avatar_url": "https://uploads.cdn.triller.co/v1/avatars/455830511/1619366527_avatar.jpg", "profile_cover_url": "None", "dm_registered": true, "storefront_url": "None", "creator_status": false, "contributor_status": false, "user_uuid": "bce20042-a143-4caf-adbc-6b39bbb2d30a", "about_me": "Go stream my new album Future Nostalgia The Moonlight Edition❤️\ndualipa.co/weregood-video", "auto_confirmed": true, "instagram_handle": "#dualipa", "instagram_verified": false, "soundcloud_url": "None", "button_text": "None", "button_text_color": "None", "button_background_color": "None", "button_url": "None", "follower_count": 0, "followed_count": 55, "verified": true, "failed_age_validation": false, "has_snaps": false, "profile_type": "public", "blocking_user": false, "blocked_by_user": false, "followed_by_me": "false", "follower_of_me": "false", "subscription": {"is_subscribed": false}}, "status": true}
I have tried the following, but it did not work:
json_string = json_string.replace(u'\xa0', u'')
json_string = unicodedata.normalize("NFKD", json_string)
There is a newline character within a string. JSON does not allow line breaks within strings. Replace the line break with an escape sequence:
json.loads(json_string.replace('\n', r'\n'))
This is how it worked for me:
import json
import unicodedata
json_string = json.loads(json.dumps(json_string))
json_string = json_string.replace("\"false\"", "\"False\"").replace("false", "\"False\"").replace("true", "\"True\"").replace("\n", " ")
json_string = unicodedata.normalize("NFKD", json_string)
json_string = json_string.replace(u'\xa0', u'')
json_string = json_string.replace('\n', r'\n')
data = json.loads(json_string)
print(data)
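For completeness: the Invalid \escape error actually points at the \xa0 in "Dua\xa0Lipa". JSON only allows \uXXXX escapes, not Python-style \xNN, so rewriting those escapes (and relaxing the decoder about raw control characters) can be enough on its own. A sketch, assuming the \x sequences appear as literal backslash escapes in the raw text:

import json
import re

json_string = r'{"name": "Dua\xa0Lipa"}'  # stand-in for the raw text above

# JSON permits only \uXXXX escapes, so rewrite Python-style \xNN
# escapes (e.g. \xa0) into their \u00NN equivalents
json_string = re.sub(r'\\x([0-9a-fA-F]{2})', r'\\u00\1', json_string)

# strict=False additionally lets the decoder accept raw control
# characters (such as a literal newline) inside strings
data = json.loads(json_string, strict=False)
print(data)  # {'name': 'Dua\xa0Lipa'}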

Schrodinger's JSON - when working with a JSON doc it is erroring as both a list and a dict

I have a Python list from an API call:
[{'_id': '5f563c1bf8eaa9d98eca231f',
  'allEnabledDs': None,
  'allEnabledIdSor': None,
  'correlationFilterEntitySource': True,
  'correlation_completed': True,
  'correlation_completed_dt': '2020-09-07T13:56:43.547Z',
  'created_at': '2020-09-07T13:56:43.469Z',
  'dsConnectionList': None,
  'folderToLabelMapping': None,
  'idConnectionList': None,
  'identities_scanned': 0,
  'identityResolutionScan': False,
  'info': None,
  'isCustomScanProfile': None,
  'modelId': None,
  'name': 'Identity Discovery Scan',
  'origin': 'Identity Discovery Scan',
  'piisummary_completed_dt': '2020-09-07T13:56:43.642Z',
  'scan_progress_status': {'Started': '2020-09-07T13:56:43.469Z'},
  'shouldCreateClassifiers': None,
  'skipIdScan': None,
  'state': 'Started',
  'stopRequested': True,
  'type': 'identityDiscoveryScan',
  'updated_at': '2020-09-07T16:59:45.294Z'}]
And this is my code:
for i in live_scans:
    url = url
    payload = {}
    headers = {
        "Authorization": token
    }
    r = requests.get(url, headers=headers, data=payload)
    j_doc = r.json()
    d = {k: v for k, v in (x.split(':') for x in j_doc)}
    if j_doc['state'] == "Stopped":
        print("YAY!")
    if d['state'] == "Stopped":
        print("YAY!")
However when using this code:
if n_dict['state'] == "Stopped":
    print("YAY!")
This error occurs:
TypeError: list indices must be integers or slices, not str
And when attempting to split the list into a dict with:
d = {k:v for k,v in (x.split(':') for x in j_doc)}
Can someone give me a pointer into why this is happening and how to fix it?
As @Hitobat mentioned in a comment - you have a list with a dictionary inside, so you have to use [0] to get this dictionary. Or you have to use a for loop if the list has more elements.
data = [{'_id': '5f563c1bf8eaa9d98eca231f', 'allEnabledDs': None, 'allEnabledIdSor': None, 'correlationFilterEntitySource': True, 'created_at': '2020-09-07T13:56:43.469Z', 'dsConnectionList': None, 'folderToLabelMapping': None, 'idConnectionList': None, 'identityResolutionScan': False, 'info': None, 'isCustomScanProfile': None, 'modelId': None, 'name': 'Identity Discovery Scan', 'origin': 'Identity Discovery Scan', 'scan_progress_status': {'Started': '2020-09-07T13:56:43.469Z'}, 'shouldCreateClassifiers': None, 'skipIdScan': None, 'state': 'Started', 'type': 'identityDiscoveryScan', 'updated_at': '2020-09-07T16:59:45.294Z', 'identities_scanned': 0, 'correlation_completed': True, 'correlation_completed_dt': '2020-09-07T13:56:43.547Z', 'piisummary_completed_dt': '2020-09-07T13:56:43.642Z', 'stopRequested': True}]
print( data[0]['state'] )
for item in data:
    print( item['state'] )
Next time you can use type() to check what you have:
print( type(data) )
If it is a list, then you can test its length and/or check the first element:
print( len(data) )
print( type( data[0] ) )
If it is a dict, then you can check what keys you can use:
print( data[0].keys() )
This way you can work out how to get the expected element(s).
You can also use json to format it with indentation and see what it looks like:
import json
print( json.dumps(data, indent=2) )
Results:
[
  {
    "_id": "5f563c1bf8eaa9d98eca231f",
    "allEnabledDs": null,
    "allEnabledIdSor": null,
    "correlationFilterEntitySource": true,
    "created_at": "2020-09-07T13:56:43.469Z",
    "dsConnectionList": null,
    "folderToLabelMapping": null,
    "idConnectionList": null,
    "identityResolutionScan": false,
    "info": null,
    "isCustomScanProfile": null,
    "modelId": null,
    "name": "Identity Discovery Scan",
    "origin": "Identity Discovery Scan",
    "scan_progress_status": {
      "Started": "2020-09-07T13:56:43.469Z"
    },
    "shouldCreateClassifiers": null,
    "skipIdScan": null,
    "state": "Started",
    "type": "identityDiscoveryScan",
    "updated_at": "2020-09-07T16:59:45.294Z",
    "identities_scanned": 0,
    "correlation_completed": true,
    "correlation_completed_dt": "2020-09-07T13:56:43.547Z",
    "piisummary_completed_dt": "2020-09-07T13:56:43.642Z",
    "stopRequested": true
  }
]
In a similar way you can use pprint (Pretty Print):
import pprint
pprint.pprint(data)
Result:
[{'_id': '5f563c1bf8eaa9d98eca231f',
  'allEnabledDs': None,
  'allEnabledIdSor': None,
  'correlationFilterEntitySource': True,
  'correlation_completed': True,
  'correlation_completed_dt': '2020-09-07T13:56:43.547Z',
  'created_at': '2020-09-07T13:56:43.469Z',
  'dsConnectionList': None,
  'folderToLabelMapping': None,
  'idConnectionList': None,
  'identities_scanned': 0,
  'identityResolutionScan': False,
  'info': None,
  'isCustomScanProfile': None,
  'modelId': None,
  'name': 'Identity Discovery Scan',
  'origin': 'Identity Discovery Scan',
  'piisummary_completed_dt': '2020-09-07T13:56:43.642Z',
  'scan_progress_status': {'Started': '2020-09-07T13:56:43.469Z'},
  'shouldCreateClassifiers': None,
  'skipIdScan': None,
  'state': 'Started',
  'stopRequested': True,
  'type': 'identityDiscoveryScan',
  'updated_at': '2020-09-07T16:59:45.294Z'}]

AWS Glue: How to expand nested Hive struct to Dict?

I'm trying to expand the field mappings in a Table mapped by my AWS Glue crawler into a nested dictionary in Python. But I can't find any Spark/Hive parsers to deserialize the
var_type = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
string located in table_schema['Table']['StorageDescriptor']['Columns'] to a Python dict.
How to dump the table definition in Glue:
import boto3
client = boto3.client('glue')
client.get_table(DatabaseName=selected_db, Name=selected_table)
Response:
table_schema = {
    'Table': {
        'Name': 'asdfasdf',
        'DatabaseName': 'asdfasdf',
        'Owner': 'owner',
        'CreateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'UpdateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'LastAccessTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'Retention': 0,
        'StorageDescriptor': {
            'Columns': [
                {'Name': 'version', 'Type': 'int'},
                {'Name': 'payload',
                 'Type': 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'},
                {'Name': 'origin', 'Type': 'string'}
            ],
            'Location': 's3://asdfasdf/',
            'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
            'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
            'Compressed': False,
            'NumberOfBuckets': -1,
            'SerdeInfo': {
                'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe',
                'Parameters': {'paths': 'origin,payload,version'}
            },
            'BucketColumns': [],
            'SortColumns': [],
            'Parameters': {
                'CrawlerSchemaDeserializerVersion': '1.0',
                'CrawlerSchemaSerializerVersion': '1.0',
                'UPDATED_BY_CRAWLER': 'asdfasdf',
                'averageRecordSize': '799',
                'classification': 'json',
                'compressionType': 'none',
                'objectCount': '94',
                'recordCount': '92171',
                'sizeKey': '74221058',
                'typeOfData': 'file'
            },
            'StoredAsSubDirectories': False
        },
        'PartitionKeys': [
            {'Name': 'partition_0', 'Type': 'string'},
            {'Name': 'partition_1', 'Type': 'string'},
            {'Name': 'partition_2', 'Type': 'string'}
        ],
        'TableType': 'EXTERNAL_TABLE',
        'Parameters': {
            'CrawlerSchemaDeserializerVersion': '1.0',
            'CrawlerSchemaSerializerVersion': '1.0',
            'UPDATED_BY_CRAWLER': 'asdfasdf',
            'averageRecordSize': '799',
            'classification': 'json',
            'compressionType': 'none',
            'objectCount': '94',
            'recordCount': '92171',
            'sizeKey': '74221058',
            'typeOfData': 'file'
        },
        'CreatedBy': 'arn:aws:sts::asdfasdf'
    },
    'ResponseMetadata': {
        'RequestId': 'asdfasdf',
        'HTTPStatusCode': 200,
        'HTTPHeaders': {
            'date': 'Thu, 01 Aug 2019 16:23:06 GMT',
            'content-type': 'application/x-amz-json-1.1',
            'content-length': '3471',
            'connection': 'keep-alive',
            'x-amzn-requestid': 'asdfasdf'
        },
        'RetryAttempts': 0
    }
}
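(For reference, the struct string the question is about can be pulled out of that response like this; 'payload' is the column name shown above.)

columns = table_schema['Table']['StorageDescriptor']['Columns']
var_type = next(c['Type'] for c in columns if c['Name'] == 'payload')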
The goal would be a Python dictionary with values for each field type, vs. the embedded string. E.g.
expand_function('struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>')
returns
{
  'loc_lat': 'double',
  'service_handler': 'string',
  'ip_address': 'string',
  'device': 'bigint',
  'source': {
    'id': 'string',
    'contacts': {
      'admin': {
        'email': 'string',
        'name': 'string'
      }
    },
    'name': 'string'
  },
  'loc_name': 'string'
}
Thanks!
The accepted answer doesn't handle arrays.
This solution does:
import json
import re


def _hive_struct_to_json(hive_str):
    """
    Expands embedded Hive struct strings to Python dictionaries
    Args:
        Hive struct format as string
    Returns
        JSON object
    """
    r = re.compile(r'(.*?)(struct<|array<|[:,>])(.*)')
    root = dict()

    to_parse = hive_str
    parents = []
    curr_elem = root
    key = None

    while to_parse:
        left, operator, to_parse = r.match(to_parse).groups()

        if operator == 'struct<' or operator == 'array<':
            parents.append(curr_elem)
            new_elem = dict() if operator == 'struct<' else list()
            if key:
                curr_elem[key] = new_elem
                curr_elem = new_elem
            elif isinstance(curr_elem, list):
                curr_elem.append(new_elem)
                curr_elem = new_elem
            key = None
        elif operator == ':':
            key = left
        elif operator == ',' or operator == '>':
            if left:
                if isinstance(curr_elem, dict):
                    curr_elem[key] = left
                elif isinstance(curr_elem, list):
                    curr_elem.append(left)
            if operator == '>':
                curr_elem = parents.pop()

    return root


hive_str = '''
struct<
    loc_lat:double,
    service_handler:string,
    ip_address:string,
    device:bigint,
    source:struct<
        id:string,
        contacts:struct<
            admin:struct<
                email:string,
                name:array<string>
            >
        >,
        name:string
    >,
    loc_name:string,
    tags:array<
        struct<
            key:string,
            value:string
        >
    >
>
'''
hive_str = re.sub(r'[\s]+', '', hive_str).strip()
print(hive_str)

print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
Prints:
struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:array<string>>>,name:string>,loc_name:string,tags:array<struct<key:string,value:string>>>
{
  "loc_lat": "double",
  "service_handler": "string",
  "ip_address": "string",
  "device": "bigint",
  "source": {
    "id": "string",
    "contacts": {
      "admin": {
        "email": "string",
        "name": [
          "string"
        ]
      }
    },
    "name": "string"
  },
  "loc_name": "string",
  "tags": [
    {
      "key": "string",
      "value": "string"
    }
  ]
}
Here's a function running on the embedded Hive struct string above.
import json


def _hive_struct_to_json(hive_struct):
    """
    Expands embedded Hive struct strings to Python dictionaries
    Args:
        Hive struct format as string
    Returns
        JSON object
    """
    # Convert embedded hive type definition string to JSON
    hive_struct = hive_struct.replace(':', '":"')
    hive_struct = hive_struct.replace(',', '","')
    hive_struct = hive_struct.replace('struct<', '{"')
    hive_struct = hive_struct.replace('"{"', '{"')
    hive_struct = hive_struct.replace('>', '"}')
    hive_struct = hive_struct.replace('}"', '}')
    return json.loads(hive_struct)


hive_str = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
Returns:
{
  "loc_lat": "double",
  "service_handler": "string",
  "ip_address": "string",
  "device": "bigint",
  "source": {
    "id": "string",
    "contacts": {
      "admin": {
        "email": "string",
        "name": "string"
      }
    },
    "name": "string"
  },
  "loc_name": "string"
}
I tried to scout some existing approaches and found some helper functions in pyspark.
import pyspark.sql.types as T
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("tmp").getOrCreate()
struct_map = T._parse_datatype_string("MAP < STRING, STRUCT < year: INT, place: STRING, details: STRING >>")
struct_map is a pyspark type that in turn has nested fields to iterate over. Once you have an object like the above, performing a recursive call to flatten it should be easy, as sketched below. I am open to hearing opinions from others about this approach.
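For what it's worth, here is a sketch of that recursion (my own illustration, reusing the struct_map parsed above; leaf types come back via their simpleString() names):

import pyspark.sql.types as T


def type_to_dict(dtype):
    """Recursively convert a parsed pyspark DataType into plain Python."""
    if isinstance(dtype, T.StructType):
        return {f.name: type_to_dict(f.dataType) for f in dtype.fields}
    if isinstance(dtype, T.ArrayType):
        return [type_to_dict(dtype.elementType)]
    if isinstance(dtype, T.MapType):
        return {'key': type_to_dict(dtype.keyType),
                'value': type_to_dict(dtype.valueType)}
    return dtype.simpleString()  # leaf types, e.g. 'string', 'int', 'bigint'


print(type_to_dict(struct_map))
# {'key': 'string', 'value': {'year': 'int', 'place': 'string', 'details': 'string'}}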
