Change value dynamically with click.option callback function - python

I want to change the default list in the program by reading an option from the command line.
Using this answer by Stephen Rauch, I added a simple option to click with a simple callback function set_list(). The callback seems to run before the command-line arguments are created, but my default_option global variable is not changed.
Here is the code:
import click

run_options1 = [
    {
        "name": "name1",
        "short": "a",
        "long": "ace",
        "type": "string",
        "required": False
    }, {
        "name": "name2",
        "short": "b",
        "long": "bravo",
        "type": "number",
        "required": True
    }, {
        "name": "name3",
        "short": "c",
        "long": "candy",
        "type": "array",
        "required": True
    }
]

run_options2 = [
    {
        "name": "family1",
        "short": "a",
        "long": "ace",
        "type": "string",
        "required": False
    }, {
        "name": "family2",
        "short": "b",
        "long": "bravo",
        "type": "number",
        "required": True
    }, {
        "name": "family3",
        "short": "c",
        "long": "candy",
        "type": "array",
        "required": True
    }
]

# set default list global variable
default_option = run_options1
class OptionEatAll(click.Option):

    def __init__(self, *args, **kwargs):
        self.save_other_options = kwargs.pop('save_other_options', True)
        nargs = kwargs.pop('nargs', -1)
        assert nargs == -1, 'nargs, if set, must be -1 not {}'.format(nargs)
        super(OptionEatAll, self).__init__(*args, **kwargs)
        self._previous_parser_process = None
        self._eat_all_parser = None

    def add_to_parser(self, parser, ctx):

        def parser_process(value, state):
            # method to hook to the parser.process
            done = False
            value = [value]
            if self.save_other_options:
                # grab everything up to the next option
                while state.rargs and not done:
                    for prefix in self._eat_all_parser.prefixes:
                        if state.rargs[0].startswith(prefix):
                            done = True
                    if not done:
                        value.append(state.rargs.pop(0))
            else:
                # grab everything remaining
                value += state.rargs
                state.rargs[:] = []
            value = tuple(value)

            # call the actual process
            self._previous_parser_process(value, state)

        retval = super(OptionEatAll, self).add_to_parser(parser, ctx)
        for name in self.opts:
            our_parser = parser._long_opt.get(
                name) or parser._short_opt.get(name)
            if our_parser:
                self._eat_all_parser = our_parser
                self._previous_parser_process = our_parser.process
                our_parser.process = parser_process
                break
        return retval
def options_from_db(options):
    map_to_types = dict(
        array=str,
        number=float,
        string=str,
    )

    def decorator(f):
        for opt_params in reversed(options):
            param_decls = (
                '-' + opt_params['short'],
                '--' + opt_params['long'],
                opt_params['name'])
            attrs = dict(
                required=opt_params['required'],
                type=map_to_types.get(
                    opt_params['type'], opt_params['type'])
            )
            if opt_params['type'] == 'array':
                attrs['cls'] = OptionEatAll
                attrs['nargs'] = -1
            click.option(*param_decls, **attrs)(f)
        return f

    return decorator
@click.group()
def cli():
    pass


def set_list(ctx, param, value):
    if not value or ctx.resilient_parsing:
        return
    global default_option
    if value == "l2":
        print("change default list")
        default_option = run_options2
    print("default list", default_option)


@cli.command()
@click.option('--list', callback=set_list,
              expose_value=False, is_eager=True)
@options_from_db(default_option)
@click.argument('url')
def run(*args, **kwargs):
    click.echo('args: {}'.format(args))
    click.echo('kwargs: {}'.format(kwargs))
if __name__ == "__main__":
    commands = (
        'run www.mysite.com/api/get -a testparam --bravo 5 -c item1 item2 --list l2',
        '',
        '--help',
        'run --help',
    )

    import time
    time.sleep(1)
    for cmd in commands:
        try:
            time.sleep(0.1)
            print('-----------')
            print('> ' + cmd)
            time.sleep(0.1)
            cli(cmd.split())
        except BaseException as exc:
            if str(exc) != '0' and \
                    not isinstance(exc, (click.ClickException, SystemExit)):
                raise
The program generates this output:
Results
> run www.mysite.com/api/get -a testparam --bravo 5 -c item1 item2 --list l2
change default list
default list [{'name': 'family1', 'short': 'a', 'long': 'ace', 'type': 'string', 'required': False}, {'name': 'family2', 'short': 'b', 'long': 'bravo', 'type': 'number', 'required': True}, {'name': 'family3', 'short': 'c', 'long': 'candy', 'type': 'array', 'required': True}]
args: ()
kwargs: {'name1': 'testparam', 'name2': 5.0, 'name3': ('item1', 'item2'), 'url': 'www.mysite.com/api/get'}
-----------
So default_option is actually not changed. If it worked, the output would be
kwargs: {'family1': 'testparam', 'family2': 5.0, 'family3': ('item1', 'item2'), 'url': 'www.mysite.com/api/get'}
What can I do to implement that?
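A likely explanation: decorators run once, at import time, so @options_from_db(default_option) has already captured run_options1 long before click parses --list and fires the eager callback. One possible workaround (a sketch, not from the linked answer) is to pick the list by pre-scanning the raw argument list before the command is even defined:

import sys

def pick_default_options(argv):
    # hypothetical helper: choose the option list before any decorator runs;
    # use sys.argv in real runs, or cmd.split() in the test harness above
    if '--list' in argv:
        idx = argv.index('--list')
        if idx + 1 < len(argv) and argv[idx + 1] == 'l2':
            return run_options2
    return run_options1

default_option = pick_default_options(sys.argv)  # must run before `def run` is decorated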

Related

retrieving data from json with python

I have nested JSON data. I want to get the value of the key "name" inside the "value" dictionary, based on the key "id" in the "key" dictionary (letting the user enter the id). I don't want to use positional indexing, because the positions change on every URL. Also, the data is large, so I need a one-line solution (without a for loop).
Code
import requests, re, json
r = requests.get('https://www.trendyol.com/apple/macbook-air-13-m1-8gb-256gb-ssd-altin-p-67940132').text
json_data1 = json.loads(re.search(r"window.__PRODUCT_DETAIL_APP_INITIAL_STATE__=({.*}});window", r).group(1))
print(json_data1)
print('json_data1:',json_data1['product']['attributes'][0]['value']['name'])
Output
{'product': {'attributes': [{'key': {'name': 'İşlemci Tipi', 'id': 168}, 'value': {'name': 'Apple M1', 'id': 243383}, 'starred': True, 'description': '', 'mediaUrls': []}, {'key': {'name': 'SSD Kapasitesi', 'id': 249}..........
json_data1: Apple M1
JSON Data
{
"product": {
"attributes": [
{
"key": { "name": "İşlemci Tipi", "id": 168 },
"value": { "name": "Apple M1", "id": 243383 },
"starred": true,
"description": "",
"mediaUrls": []
},
{
"key": { "name": "SSD Kapasitesi", "id": 249 },
"value": { "name": "256 GB", "id": 3376 },
"starred": true,
"description": "",
"mediaUrls": []
},
.
.
.
]
}
}
Expected Output is getting value by key id: (type must be str)
input >> id: 168
output >> name: Apple M1
Since you originally didn't want a for loop, but now it's a matter of speed, here's a solution with a for loop. You can test it and see if it's faster than the one you already had.
import json

with open("file.json") as f:
    data = json.load(f)

search_key = int(input("Enter id: "))

for i in range(0, len(data['product']['attributes'])):
    if search_key == data['product']['attributes'][i]['key']['id']:
        print(data['product']['attributes'][i]['value']['name'])
Input >> Enter id: 168
Output >> Apple M1
I found a solution with a for loop. It works fast, so I preferred it.
for i in json_data1['product']['attributes']:
    cpu = list(list(i.values())[0].values())[1]
    if cpu == 168:
        print(list(list(i.values())[1].values())[0])
Iteration is unavoidable if the index is unknown, but the cost can be reduced substantially by using a generator expression and Python's built-in next function:
next((x["value"]["name"] for x in data["product"]["attributes"] if x["key"]["id"] == 168), None)
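For the exact input/output format asked for in the question, the same expression can be driven by user input (a sketch reusing json_data1 from the question's code):

search_key = int(input("id: "))  # e.g. 168
name = next((x["value"]["name"] for x in json_data1["product"]["attributes"]
             if x["key"]["id"] == search_key), None)
print("name:", name)  # name: Apple M1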
To verify that a generator expression is in fact faster than a for loop, here is a comparison of the running time of xFranko's solution and the above:
import time


def time_func(func):
    def timer(*args):
        time1 = time.perf_counter()
        func(*args)
        time2 = time.perf_counter()
        return (time2 - time1) * 1000
    return timer


number_of_attributes = 100000

data = {
    "product": {
        "attributes": [
            {
                "key": {"name": "İşlemci Tipi", "id": i},
                "value": {"name": "name" + str(i), "id": 243383},
                "starred": True,
                "description": "",
                "mediaUrls": []
            } for i in range(number_of_attributes)
        ]
    }
}


def getName_generator(id):
    return next((x["value"]["name"] for x in data["product"]["attributes"] if x["key"]["id"] == id), None)


def getName_for_loop(id):
    return_value = None
    for i in range(0, len(data['product']['attributes'])):
        if id == data['product']['attributes'][i]['key']['id']:
            return_value = data['product']['attributes'][i]['value']['name']
    return return_value


print("Generator:", time_func(getName_generator)(0))
print("For loop:", time_func(getName_for_loop)(0))
print()
print("Generator:", time_func(getName_generator)(number_of_attributes - 1))
print("For loop:", time_func(getName_for_loop)(number_of_attributes - 1))
My results:
Generator: 0.0075999999999964984
For loop: 43.73920000000003
Generator: 23.633300000000023
For loop: 49.839699999999986
Conclusion:
For large data sets, a generator expression is indeed faster, even if it has to traverse the entire set.
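One caveat about the benchmark (an observation, not part of the original answer): getName_for_loop never returns early, so it scans all 100,000 items even after finding a match. A variant that returns on the first hit narrows the gap for matches near the front:

def getName_for_loop_early(id):
    # return as soon as the first matching id is found
    for attr in data['product']['attributes']:
        if attr['key']['id'] == id:
            return attr['value']['name']
    return None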

parse weird yaml file uploaded to server with python

I have a config server that we read the service config from.
There is a YAML file in there that I need to read, but it has a weird format on the server, looking like this:
{
"document[0].Name": "os",
"document[0].Rules.Rule1": false,
"document[0].Rules.Rule2": true,
"document[0].MinScore": 100,
"document[0].MaxScore": 100,
"document[0].ClusterId": 22,
"document[0].Enabled": true,
"document[0].Module": "device",
"document[0].Description": "",
"document[0].Modified": 1577880000000,
"document[0].Created": 1577880000000,
"document[0].RequiredReview": false,
"document[0].Type": "NO_CODE",
"document[1].Name": "rule with params test",
"document[1].Rules.Rule": false,
"document[1].MinScore": 100,
"document[1].MaxScore": 100,
"document[1].ClusterId": 29,
"document[1].Enabled": true,
"document[1].Module": "device",
"document[1].Description": "rule with params test",
"document[1].Modified": 1577880000000,
"document[1].Created": 1577880000000,
"document[1].RequiredReview": false,
"document[1].Type": "NO_CODE",
"document[1].ParametersRules[0].Features.feature1.op": ">",
"document[1].ParametersRules[0].Features.feature1.value": 10,
"document[1].ParametersRules[0].Features.feature2.op": "==",
"document[1].ParametersRules[0].Features.feature2.value": true,
"document[1].ParametersRules[0].Features.feature3.op": "range",
"document[1].ParametersRules[0].Features.feature3.value[0]": 4,
"document[1].ParametersRules[0].Features.feature3.value[1]": 10,
"document[1].ParametersRules[0].Features.feature4.op": "!=",
"document[1].ParametersRules[0].Features.feature4.value": "None",
"document[1].ParametersRules[0].DecisionType": "all",
"document[1].ParametersRules[1].Features.feature5.op": "<",
"document[1].ParametersRules[1].Features.feature5.value": 1000,
"document[1].ParametersRules[1].DecisionType": "any"
}
and this is how the dict is supposed to look (it might not be perfect, I did it by hand):
[
{
"Name": "os",
"Rules": { "Rule1": false, "Rule2": true },
"MinScore": 100,
"MaxScore": 100,
"ClusterId": 22,
"Enabled": true,
"Module": "device",
"Description": "",
"Modified": 1577880000000,
"Created": 1577880000000,
"RequiredReview": false,
"Type": "NO_CODE"
},
{
"Name": "rule with params test",
"Rules": { "Rule": false},
"MinScore": 100,
"MaxScore": 100,
"ClusterId": 29,
"Enabled": true,
"Module": "device",
"Description": "rule with params test",
"Modified": 1577880000000,
"Created": 1577880000000,
"RequiredReview": false,
"Type": "NO_CODE",
"ParametersRules":[
{"Features": {"feature1": {"op": ">", "value": 10},
"feature2": {"op": "==", "value": true},
"feature3": {"op": "range", "value": [4,10]},
"feature4": {"op": "!=", "value": "None"}} ,
"DecisionType": "all"},
{"Features": { "feature5": { "op": "<", "value": 1000 }},
"DecisionType": "any"}
]
}
]
I don't have a way to change how the file is uploaded to the server (it's a different team and quite the headache), so I need to parse it using Python.
My thought is that someone has probably encountered this before, so there must be a package that solves it, and I hoped that someone here might know.
Thanks.
I have a sample; I hope it'll help you.
import yaml
import os

file_dir = os.path.dirname(os.path.abspath(__file__))
config = yaml.full_load(open(f"{file_dir}/file.json"))
yaml_file = open(f'{file_dir}/meta.yaml', 'w+')
yaml.dump(config, yaml_file, allow_unicode=True)  # this converts your json file to yaml
Your current output is:
- ClusterId: 22
  Created: 1577880000000
  Description: ''
  Enabled: true
  MaxScore: 100
  MinScore: 100
  Modified: 1577880000000
  Module: device
  Name: os
  RequiredReview: false
  Rules:
    Rule1: false
    Rule2: true
  Type: NO_CODE
- ClusterId: 29
  Created: 1577880000000
  Description: rule with params test
  Enabled: true
  MaxScore: 100
  MinScore: 100
  Modified: 1577880000000
  Module: device
  Name: rule with params test
  ParametersRules:
  - DecisionType: all
    Features:
      feature1:
        op: '>'
        value: 10
      feature2:
        op: ==
        value: true
      feature3:
        op: range
        value:
        - 4
        - 10
      feature4:
        op: '!='
        value: None
  - DecisionType: any
    Features:
      feature5:
        op: <
        value: 1000
  RequiredReview: false
  Rules:
    Rule: false
  Type: NO_CODE
Here is my approach so far. It's far from perfect, but I hope it gives you an idea of how to tackle it.
from __future__ import annotations  # can be removed in Python 3.10+


def clean_value(o: str | bool | int) -> str | bool | int | None:
    """handle int, None, or bool values encoded as a string"""
    if isinstance(o, str):
        lowercase = o.lower()
        if lowercase.isnumeric():
            return int(o)
        elif lowercase == 'none':
            return None
        elif lowercase in ('true', 'false'):
            return lowercase == 'true'
        # return eval(o.capitalize())
    return o


# noinspection PyUnboundLocalVariable
def process(o: dict):
    # final return list
    docs_list = []

    doc: dict[str, list | dict | str | bool | int | None]
    doc_idx: int

    def add_new_doc(new_idx: int):
        """Push new item to result list, and increment index."""
        nonlocal doc_idx, doc
        doc_idx = new_idx
        doc = {}
        docs_list.append(doc)

    # add initial `dict` object to return list
    add_new_doc(0)

    for k, v in o.items():
        doc_id, key, *parts = k.split('.')
        doc_id: str
        key: str
        parts: list[str]

        curr_doc_idx = int(doc_id.rsplit('[', 1)[1].rstrip(']'))
        if curr_doc_idx > doc_idx:
            add_new_doc(curr_doc_idx)

        if not parts:
            final_val = clean_value(v)
        elif key in doc:
            # For example, when we encounter `document[0].Rules.Rule2`, but we've already encountered
            # `document[0].Rules.Rule1` - so in this case, we add value to the existing dict.
            final_val = temp_dict = doc[key]
            temp_dict: dict
            for p in parts[:-1]:
                temp_dict = temp_dict.setdefault(p, {})
            temp_dict[parts[-1]] = clean_value(v)
        else:
            final_val = temp_dict = {}
            for p in parts[:-1]:
                temp_dict = temp_dict[p] = {}
            temp_dict[parts[-1]] = clean_value(v)

        doc[key] = final_val

    return docs_list
if __name__ == '__main__':
    import json
    from pprint import pprint

    j = """{
      "document[0].Name": "os",
      "document[0].Rules.Rule1": false,
      "document[0].Rules.Rule2": "true",
      "document[0].MinScore": 100,
      "document[0].MaxScore": 100,
      "document[0].ClusterId": 22,
      "document[0].Enabled": true,
      "document[0].Module": "device",
      "document[0].Description": "",
      "document[0].Modified": 1577880000000,
      "document[0].Created": 1577880000000,
      "document[0].RequiredReview": false,
      "document[0].Type": "NO_CODE",
      "document[1].Name": "rule with params test",
      "document[1].Rules.Rule": false,
      "document[1].MinScore": 100,
      "document[1].MaxScore": 100,
      "document[1].ClusterId": 29,
      "document[1].Enabled": true,
      "document[1].Module": "device",
      "document[1].Description": "rule with params test",
      "document[1].Modified": 1577880000000,
      "document[1].Created": 1577880000000,
      "document[1].RequiredReview": false,
      "document[1].Type": "NO_CODE",
      "document[1].ParametersRules[0].Features.feature1.op": ">",
      "document[1].ParametersRules[0].Features.feature1.value": 10,
      "document[1].ParametersRules[0].Features.feature2.op": "==",
      "document[1].ParametersRules[0].Features.feature2.value": true,
      "document[1].ParametersRules[0].Features.feature3.op": "range",
      "document[1].ParametersRules[0].Features.feature3.value[0]": 4,
      "document[1].ParametersRules[0].Features.feature3.value[1]": 10,
      "document[1].ParametersRules[0].Features.feature4.op": "!=",
      "document[1].ParametersRules[0].Features.feature4.value": "None",
      "document[1].ParametersRules[0].DecisionType": "all",
      "document[1].ParametersRules[1].Features.feature5.op": "<",
      "document[1].ParametersRules[1].Features.feature5.value": 1000,
      "document[1].ParametersRules[1].DecisionType": "any"
    }"""

    d: dict[str, str | bool | int | None] = json.loads(j)

    result = process(d)
    pprint(result)
Result:
[{'ClusterId': 22,
  'Created': 1577880000000,
  'Description': '',
  'Enabled': True,
  'MaxScore': 100,
  'MinScore': 100,
  'Modified': 1577880000000,
  'Module': 'device',
  'Name': 'os',
  'RequiredReview': False,
  'Rules': {'Rule1': False, 'Rule2': True},
  'Type': 'NO_CODE'},
 {'ClusterId': 29,
  'Created': 1577880000000,
  'Description': 'rule with params test',
  'Enabled': True,
  'MaxScore': 100,
  'MinScore': 100,
  'Modified': 1577880000000,
  'Module': 'device',
  'Name': 'rule with params test',
  'ParametersRules[0]': {'DecisionType': 'all',
                         'Features': {'feature1': {'value': 10},
                                      'feature2': {'op': '==', 'value': True},
                                      'feature3': {'op': 'range',
                                                   'value[0]': 4,
                                                   'value[1]': 10},
                                      'feature4': {'op': '!=', 'value': None}}},
  'ParametersRules[1]': {'DecisionType': 'any',
                         'Features': {'feature5': {'value': 1000}}},
  'RequiredReview': False,
  'Rules': {'Rule': False},
  'Type': 'NO_CODE'}]
Of course, one of the problems is that it doesn't account for nested list paths like document[1].ParametersRules[0].Features.feature1.op, which should ideally create a new sub-list to add values to.
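A rough sketch of how those list paths could be handled generically (my own addition, lightly tested at best): parse each dotted key into a mix of names and integer indices, then create dicts or lists as needed while walking down:

import re

_TOKEN = re.compile(r'([^.\[\]]+)|\[(\d+)\]')

def parse_key(key):
    # 'document[1].Features.value[0]' -> ['document', 1, 'Features', 'value', 0]
    return [name if name else int(idx) for name, idx in _TOKEN.findall(key)]

def set_path(root, tokens, value):
    node = root
    for i, tok in enumerate(tokens):
        last = i == len(tokens) - 1
        if isinstance(tok, int):
            while len(node) <= tok:  # grow the list up to this index
                node.append(None)
            if last:
                node[tok] = value
            else:
                if node[tok] is None:  # next token decides dict vs list
                    node[tok] = [] if isinstance(tokens[i + 1], int) else {}
                node = node[tok]
        else:
            if last:
                node[tok] = value
            else:
                if tok not in node:
                    node[tok] = [] if isinstance(tokens[i + 1], int) else {}
                node = node[tok]

def unflatten(flat):
    root = {}
    for key, value in flat.items():
        set_path(root, parse_key(key), value)
    return root

# unflatten(json.loads(j))['document'] then yields the desired list of documents;
# values could still be routed through clean_value() from above.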

{'message': 'Forbidden'} instead of returning Dictionary

I'm making a CoinBasePro trading bot and I've run into an error when trying to open a position.
class Trader():
    def __init__(self):
        self.public = 'Public'
        self.passphrase = 'Passphrase'
        self.secret = 'Secret'
        self.auth_client = cbpro.AuthenticatedClient(key=self.public, b64secret=self.secret, passphrase=self.passphrase)
Then there is an RSI calculation function (with a float output) and a direction function.
The next step is the function that makes a position:
def make_pos(self, ticker):
    rsi = self.RSI_calc(ticker)
    direction = self.set_direction(ticker)
    if rsi <= 42 and direction == 'buy':
        position = self.auth_client.place_market_order(product_id=ticker, side=direction, size=self.QUANTITY)
        print(position)
It should output something like:
{
"id": "d0c5340b-6d6c-49d9-b567-48c4bfca13d2",
"price": "0.10000000",
"size": "0.01000000",
"product_id": "BTC-USD",
"side": "buy",
"stp": "dc",
"type": "limit",
"time_in_force": "GTC",
"post_only": false,
"created_at": "2016-12-08T20:02:28.53864Z",
"fill_fees": "0.0000000000000000",
"filled_size": "0.00000000",
"executed_value": "0.0000000000000000",
"status": "pending",
"settled": false
}
but instead it returns {'message': 'Forbidden'}.
Any hints please?
My thanks.
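A hedged first debugging step (assuming the standard cbpro client used in the question): {'message': 'Forbidden'} usually points at the credentials (wrong key/secret/passphrase, sandbox vs. live endpoint, or an API key created without the trade permission) rather than at the order parameters. Verifying the key with a harmless authenticated call can narrow it down:

import cbpro

auth_client = cbpro.AuthenticatedClient(key='Public', b64secret='Secret', passphrase='Passphrase')
accounts = auth_client.get_accounts()
print(accounts)  # a list of accounts on success; {'message': ...} if auth fails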

Add json data recursively to json object python

I have this initial JSON, which I build in this piece of code:
menu_structure = []
menu_root = cursor.fetchall()
for menu_item in menu_root:
    menu_structure.append({"name": menu_item[1]})
    get_tree_branch(lang=lang, id_parent=menu_item[0], parent_name=menu_item[1], menu_structure=menu_structure,
                    cursor=cursor)
JSON 1
[
{
"name": "Fish",
"children": [
{
"name": "Fish of two waters"
},
{
"name": "Sea water fish"
},
{
"name": "Fresh water fish"
}
]
},
{
"name": "Seafood"
}
]
I make an SQL query to see whether Fish of two waters has children; if it does, I only receive a string with the name of the child, and I want to add the child to the JSON object like this:
JSON 2
[
{
"name": "Fish",
"children": [
{
"name": "Fish of two waters",
"children": [
{
"name": "Test fish"
}]
},
{
"name": "Sea water fish"
},
{
"name": "Fresh water fish"
}
]
},
{
"name": "Seafood"
}
]
To do that, in my Python code I do this:
def get_tree_branch(lang: str, id_parent: int, menu_structure, cursor):
    sql_query = "select ps_category.id_category, name from ps_category Right Join ps_category_lang On " \
                "ps_category.id_category = ps_category_lang.id_category Right Join ps_lang on " \
                "ps_category_lang.id_lang = ps_lang.id_lang where ps_category.is_root_category = 0 " \
                "AND ps_category.id_parent like %s AND ps_lang.lang_name = %s"
    sql_params = (id_parent, lang)
    cursor.execute(query=sql_query, args=sql_params)
    menu_branch = cursor.fetchall()
    if menu_branch:
        menu_structure[len(menu_structure) - 1]["children"] = []
        for menu_item in menu_branch:
            menu_structure[len(menu_structure) - 1]["children"].append({"name": menu_item[1]})
            get_tree_branch(lang=lang, id_parent=menu_item[0], menu_structure=menu_structure, cursor=cursor)
    else:
        return
But that code instead produces this JSON:
JSON 3
[
{
"name": "Fish",
"children": [
{
"name": "Test fish"
},
{
"name": "Sea water fish"
},
{
"name": "Fresh water fish"
}
]
},
{
"name": "Seafood"
}
]
Any suggestions on how to create a JSON object like JSON 2?
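For what it's worth, the reason JSON 3 comes out flat is that menu_structure[len(menu_structure) - 1] always points at the last top-level element, so grandchildren get attached to the root instead of to their parent. A minimal sketch (assuming the same cursor API and the sql_query from the question) that passes the parent dict itself down the recursion:

def get_tree_branch(lang: str, id_parent: int, parent_node: dict, cursor):
    # parent_node is the dict the children belong to, not the whole structure
    cursor.execute(query=sql_query, args=(id_parent, lang))  # sql_query as in the question
    for menu_item in cursor.fetchall():
        child = {"name": menu_item[1]}
        parent_node.setdefault("children", []).append(child)
        get_tree_branch(lang=lang, id_parent=menu_item[0], parent_node=child, cursor=cursor)

# from the root loop: get_tree_branch(lang, menu_item[0], menu_structure[-1], cursor)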
For those interested, I did this; I hope it helps.
def get_tree_menu(lang: int):
    test_db = init_database()
    cursor = test_db.cursor()
    menu_structure = []
    sql_query = """select c.id_category, cl.name from ps_category_lang as cl
                   left join ps_category c on cl.id_category = c.id_category
                   where c.is_root_category = 1 AND cl.id_lang = %s AND c.active = 1
                   order by c.position asc"""
    cursor.execute(query=sql_query, args=lang)
    menu_root = cursor.fetchall()
    id_category = 0
    name = 1
    for menu_item in menu_root:
        menu_structure.append({"name": menu_item[name]})
        get_tree_branch(lang=lang, id_parent=menu_item[id_category], parent_name=menu_item[name],
                        menu_structure=menu_structure, cursor=cursor)
    cursor.close()
    close_database(test_db=test_db)
    return menu_structure


def get_tree_branch(lang: int, id_parent: int, parent_name: str, menu_structure, cursor):
    sql_query = """select c.id_category, cl.name from ps_category_lang as cl
                   left join ps_category c on cl.id_category = c.id_category
                   where c.is_root_category = 0 AND c.id_parent = %s AND cl.id_lang = %s AND c.active = 1
                   order by c.position asc"""
    sql_params = (id_parent, lang)
    cursor.execute(query=sql_query, args=sql_params)
    menu_branch = cursor.fetchall()
    id_category = 0
    name = 1
    if menu_branch:
        for menu_item in menu_branch:
            menu_structure = create_json_structure(json_object=menu_structure, search_value=parent_name,
                                                   child_name=menu_item[name])
            get_tree_branch(lang=lang, id_parent=menu_item[id_category], parent_name=menu_item[name],
                            menu_structure=menu_structure, cursor=cursor)
    else:
        return


def create_json_structure(json_object: any, search_value: str, child_name: str):
    """Recursively search for values of key in JSON tree and add a child to the JSON element."""
    if isinstance(json_object, dict):
        for key, value in json_object.items():
            if isinstance(value, (dict, list)):
                create_json_structure(json_object=value, search_value=search_value, child_name=child_name)
            else:
                if value == search_value:
                    try:
                        json_object["children"].append({
                            "parentName": search_value,
                            "name": child_name
                        })
                    except KeyError:
                        json_object["children"] = []
                        json_object["children"].append({
                            "parentName": search_value,
                            "name": child_name
                        })
                    break
    elif isinstance(json_object, list):
        for item in json_object:
            create_json_structure(json_object=item, search_value=search_value, child_name=child_name)
    return json_object

AWS Glue: How to expand nested Hive struct to Dict?

I'm trying to expand field mappings in a Table mapped by my AWS Glue crawler to a nested dictionary in Python, but I can't find any Spark/Hive parser to deserialize the
var_type = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
string located in table_schema['Table']['StorageDescriptor']['Columns'] into a Python dict.
How to dump the table definition in Glue:
import boto3
client = boto3.client('glue')
client.get_table(DatabaseName=selected_db, Name=selected_table)
Response:
table_schema = {
    'Table': {
        'Name': 'asdfasdf',
        'DatabaseName': 'asdfasdf',
        'Owner': 'owner',
        'CreateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'UpdateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'LastAccessTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
        'Retention': 0,
        'StorageDescriptor': {
            'Columns': [{'Name': 'version', 'Type': 'int'},
                        {'Name': 'payload',
                         'Type': 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'},
                        {'Name': 'origin', 'Type': 'string'}],
            'Location': 's3://asdfasdf/',
            'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
            'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
            'Compressed': False,
            'NumberOfBuckets': -1,
            'SerdeInfo': {'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe',
                          'Parameters': {'paths': 'origin,payload,version'}},
            'BucketColumns': [],
            'SortColumns': [],
            'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
                           'CrawlerSchemaSerializerVersion': '1.0',
                           'UPDATED_BY_CRAWLER': 'asdfasdf',
                           'averageRecordSize': '799',
                           'classification': 'json',
                           'compressionType': 'none',
                           'objectCount': '94',
                           'recordCount': '92171',
                           'sizeKey': '74221058',
                           'typeOfData': 'file'},
            'StoredAsSubDirectories': False},
        'PartitionKeys': [{'Name': 'partition_0', 'Type': 'string'},
                          {'Name': 'partition_1', 'Type': 'string'},
                          {'Name': 'partition_2', 'Type': 'string'}],
        'TableType': 'EXTERNAL_TABLE',
        'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
                       'CrawlerSchemaSerializerVersion': '1.0',
                       'UPDATED_BY_CRAWLER': 'asdfasdf',
                       'averageRecordSize': '799',
                       'classification': 'json',
                       'compressionType': 'none',
                       'objectCount': '94',
                       'recordCount': '92171',
                       'sizeKey': '74221058',
                       'typeOfData': 'file'},
        'CreatedBy': 'arn:aws:sts::asdfasdf'},
    'ResponseMetadata': {
        'RequestId': 'asdfasdf',
        'HTTPStatusCode': 200,
        'HTTPHeaders': {'date': 'Thu, 01 Aug 2019 16:23:06 GMT',
                        'content-type': 'application/x-amz-json-1.1',
                        'content-length': '3471',
                        'connection': 'keep-alive',
                        'x-amzn-requestid': 'asdfasdf'},
        'RetryAttempts': 0}}
The goal would be a Python dictionary with values for each field type, instead of the embedded string. E.g.
expand_function('struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>')
returns
{
    'loc_lat': 'double',
    'service_handler': 'string',
    'ip_address': 'string',
    'device': 'bigint',
    'source': {
        'id': 'string',
        'contacts': {
            'admin': {
                'email': 'string',
                'name': 'string'
            }
        },
        'name': 'string'
    },
    'loc_name': 'string'
}
Thanks!
The accepted answer doesn't handle arrays.
This solution does:
import json
import re


def _hive_struct_to_json(hive_str):
    """
    Expands embedded Hive struct strings to Python dictionaries
    Args:
        Hive struct format as string
    Returns
        JSON object
    """
    r = re.compile(r'(.*?)(struct<|array<|[:,>])(.*)')

    root = dict()

    to_parse = hive_str
    parents = []
    curr_elem = root

    key = None
    while to_parse:
        left, operator, to_parse = r.match(to_parse).groups()

        if operator == 'struct<' or operator == 'array<':
            parents.append(curr_elem)
            new_elem = dict() if operator == 'struct<' else list()
            if key:
                curr_elem[key] = new_elem
                curr_elem = new_elem
            elif isinstance(curr_elem, list):
                curr_elem.append(new_elem)
                curr_elem = new_elem
            key = None
        elif operator == ':':
            key = left
        elif operator == ',' or operator == '>':
            if left:
                if isinstance(curr_elem, dict):
                    curr_elem[key] = left
                elif isinstance(curr_elem, list):
                    curr_elem.append(left)

            if operator == '>':
                curr_elem = parents.pop()

    return root


hive_str = '''
struct<
    loc_lat:double,
    service_handler:string,
    ip_address:string,
    device:bigint,
    source:struct<
        id:string,
        contacts:struct<
            admin:struct<
                email:string,
                name:array<string>
            >
        >,
        name:string
    >,
    loc_name:string,
    tags:array<
        struct<
            key:string,
            value:string
        >
    >
>
'''
hive_str = re.sub(r'[\s]+', '', hive_str).strip()
print(hive_str)

print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
Prints:
struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:array<string>>>,name:string>,loc_name:string,tags:array<struct<key:string,value:string>>>
{
  "loc_lat": "double",
  "service_handler": "string",
  "ip_address": "string",
  "device": "bigint",
  "source": {
    "id": "string",
    "contacts": {
      "admin": {
        "email": "string",
        "name": [
          "string"
        ]
      }
    },
    "name": "string"
  },
  "loc_name": "string",
  "tags": [
    {
      "key": "string",
      "value": "string"
    }
  ]
}
Here's a function running on the embedded Hive struct string above.
def _hive_struct_to_json(hive_struct):
    """
    Expands embedded Hive struct strings to Python dictionaries
    Args:
        Hive struct format as string
    Returns
        JSON object
    """
    # Convert embedded hive type definition string to JSON
    hive_struct = hive_struct.replace(':', '":"')
    hive_struct = hive_struct.replace(',', '","')
    hive_struct = hive_struct.replace('struct<', '{"')
    hive_struct = hive_struct.replace('"{"', '{"')
    hive_struct = hive_struct.replace('>', '"}')
    hive_struct = hive_struct.replace('}"', '}')
    return json.loads(hive_struct)


hive_str = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
Returns:
{
  "loc_lat": "double",
  "service_handler": "string",
  "ip_address": "string",
  "device": "bigint",
  "source": {
    "id": "string",
    "contacts": {
      "admin": {
        "email": "string",
        "name": "string"
      }
    },
    "name": "string"
  },
  "loc_name": "string"
}
I tried to scout around existing approaches and found some helper functions in pyspark.
import pyspark.sql.types as T
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("tmp").getOrCreate()
struct_map = T._parse_datatype_string("MAP < STRING, STRUCT < year: INT, place: STRING, details: STRING >>")
struct_map is a pyspark type that in turn has nested fields to iterate over. Once you have an object like the above, performing a recursive call to flatten it should be easy. I am open to hearing opinions from others about this approach.
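Building on that, a sketch of the recursive walk (my own, using pyspark's type classes) that turns the parsed type into the plain dict shape requested in the question:

import pyspark.sql.types as T

def type_to_dict(dt):
    """Recursively convert a pyspark DataType into plain dicts/lists/strings."""
    if isinstance(dt, T.StructType):
        return {field.name: type_to_dict(field.dataType) for field in dt.fields}
    if isinstance(dt, T.ArrayType):
        return [type_to_dict(dt.elementType)]
    if isinstance(dt, T.MapType):
        return {dt.keyType.simpleString(): type_to_dict(dt.valueType)}
    return dt.simpleString()  # leaf types: 'string', 'double', 'bigint', ...

# e.g. type_to_dict(T._parse_datatype_string('struct<loc_lat:double,device:bigint>'))
# -> {'loc_lat': 'double', 'device': 'bigint'}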
