Keep track of level of nested keys for recursive dict function? - python

I have a recursive function that grabs all keys from a python dictionary no matter what level of nesting. While this works great, I am trying to keep track of the level of nesting for each key at the same time. Some kind of counter, but not sure how to implement it. Below is what I have so far:
# Sample input: note the nested dict under "8" reuses the key "12".
d = {"12": "a",
"3": "b",
"8": {
"12": "c",
"25": "d"
}
}
# Flat accumulator of every key seen so far, in order of first appearance.
keys_list = []
# Recursively collect keys from `dictionary`, descending into dict values.
def iterate(dictionary):
for key, value in dictionary.items():
# Skips keys already collected — this is why the nested "12" is lost,
# which is exactly the problem the question asks about.
if key not in keys_list:
keys_list.append(key)
if isinstance(value,dict):
iterate(value)
continue
iterate(d)
This returns:
keys_list = ['12', '3', '8', '25']
Right now the nested "12" is being ignored because it is already in the list, but I need some sort of unique identifier for the second 12 so it is included too. Any thoughts?

You could add a depth argument to your recursive function:
d = {"12": "a",
     "3": "b",
     "8": {
         "12": "c",
         "25": "d"
     }
     }
# Accumulates (key, depth) tuples in visit order; depth 0 is the top level.
keys_list = []

def iterate(dictionary, depth=0):
    """Append a (key, depth) tuple to keys_list for every key in
    `dictionary`, recursing into nested dict values with depth + 1."""
    for key, value in dictionary.items():
        # Bug fix: the old `if key not in keys_list` test compared a string
        # key against (key, depth) tuples, so it never filtered anything —
        # dropped rather than kept as dead code.
        keys_list.append((key, depth))
        if isinstance(value, dict):
            # Bug fix: pass depth + 1 instead of mutating `depth`; mutating
            # leaked the incremented value to later siblings at this level.
            iterate(value, depth + 1)

iterate(d)
print(keys_list)
Output:
[('12', 0), ('3', 0), ('8', 0), ('12', 1), ('25', 1)]
This gives a list of tuples, where the first value in each tuple is the key, and the second value is the depth.
EDIT
the code below should cover different cases more reliably than the code above (but I did change your iterate function somewhat):
d = {"12": "a",
     "3": "b",
     "8": {
         "12": "c",
         "25": "d"
     },
     "test": "a"
     }
KEYS = []   # (key, depth) pairs in visit order
DEPTH = 0   # keep counter updated globally too; 0 means top level

def iterate(dictionary):
    """Record a (key, DEPTH) pair for every key, recursing into nested dicts."""
    global DEPTH
    for key, value in dictionary.items():
        KEYS.append((key, DEPTH))
        if isinstance(value, dict):
            DEPTH += 1
            iterate(value)
            # Bug fix: restore the *enclosing* level instead of resetting to
            # 0 — otherwise siblings that follow a nested dict at depth > 1
            # would be labelled with depth 0.
            DEPTH -= 1

iterate(d)
print(KEYS)
Output:
[('12', 0), ('3', 0), ('8', 0), ('12', 1), ('25', 1), ('test', 0)]

Related

Return the biggest but no more than a given float number using a given list of floats

This year, our school is holding a tug of war competition between classes. My classmates and I are eager to join, but we're not sure whom to send to the race, so I decided to write a program for it.
Here is my attempt with recursion, but it can only handle integer inputs, while the weights that my teacher gave us are float numbers. Is there any way to find the biggest weight combination that is no more than the weight class (target)? Also, there is a participant limit: the number of participants should be exactly the same as participants_needed.
def allPossibleCombinations(target: float, using: dict[str, float], stack=None):
    """Return every combination of names in `using` whose weights sum to
    exactly `target`, each combination as a list of names.

    Works for int and float weights alike (only subtraction and comparison
    are used).  `stack` carries the names chosen so far during recursion.
    Fixed annotations: `dict[str: int]` was a slice, not a type.
    """
    # initialize (mutable default avoided via the None sentinel)
    if stack is None:
        stack = []
    # Sort by weight so we can prune as soon as one weight exceeds the target.
    using = dict(sorted(using.items(), key=lambda item: item[1]))
    this_layer = []
    previous_layer = []
    if using and target - min(using.values()) >= 0:
        for index, (name, number) in enumerate(using.items()):
            remaining = target - number
            if remaining == 0:
                this_layer.append(stack + [name])
            elif remaining < 0:
                # Everything after this point is at least as heavy: prune.
                break
            else:
                # Recurse over the strictly-later entries only, so each
                # combination is generated exactly once.
                previous_layer.extend(
                    allPossibleCombinations(
                        remaining,
                        dict(list(using.items())[index + 1:]),
                        stack + [name],
                    )
                )
    return previous_layer + this_layer
# Try the exact target first; if no squad matches, count down one unit at a
# time and report the first achievable total with the right squad size.
def findCloset(target: int, participants_needed: int, participants: dict[str, int]): # if the target is impossible to form, try target - 1, target - 2 ...
# NOTE(review): range() requires int arguments, so this fails for float
# weights (the question being asked); the stop value also excludes
# min(participants.values()) itself — confirm that boundary is intended.
for i in range(target, min(participants.values()), -1):
temp = allPossibleCombinations(i, participants)
if temp != []:
result = []
# Keep only combinations with exactly the required squad size.
for item in temp:
if len(item) == participants_needed:
result.append(item)
if result != []:
print(f"{i}:", *result)
break
else:
# for/else: runs only if the loop finished without hitting `break`.
print("Impossible")
For example, ("A", "B", "C"... are placeholder names)
findCloset(460, 8, {"A": 53, "B": 71, "C": 42, "D": 58, "E": 49, "F": 75, "G": 56, "H": 46, "I": 49, "J": 51}) -> 459: ['H', 'E', 'J', 'A', 'G', 'D', 'B', 'F'], ['H', 'I', 'J', 'A', 'G', 'D', 'B', 'F']
The thing that I want: (Handle floats)
findCloset(460, 8, {"A": 53.4, "B": 71.0, "C": 42.3, "D": 57.6, "E": 48.9, "F": 75.1, "G": 56.2, "H": 46.1, "I": 48.7, "J": 51.3}) -> ???
Here's an example using combinations that works with float also:
from itertools import combinations
def find_closest(target, participants_needed, participants):
    """Rank every squad of `participants_needed` players whose combined
    weight does not exceed `target`.

    Returns a list of [total_weight, names_tuple] pairs, heaviest first.
    """
    names = [name for name in participants.keys()]
    squads = combinations(names, participants_needed)
    allowed_combinations = []
    for squad in squads:
        combined = sum(participants[name] for name in squad)
        if combined <= target:
            allowed_combinations.append([combined, squad])
    allowed_combinations.sort(key=lambda entry: entry[0], reverse=True)
    return allowed_combinations
When:
result = find_closest(t, pn, p)
You can get, for example, top 3:
for i in result[:3]:
print(f"Weight: {i[0]}; Players: {i[1]}")
Or just the highest:
print(f"Weight: {result[0][0]}; Players: {result[0][1]}")
Result for your example:
Weight: 459.6; Players: ('A', 'B', 'D', 'E', 'F', 'G', 'H', 'J')
The range function does not accept floats as arguments. You can cast the float value to the closest smaller/bigger integer using math.floor()/math.ceil() respectively. You have to import the math module using import math at the top of the file, then change the first line of the loop in findCloset to:
for i in range(target, math.floor(min(participants.values())), -1):

How to speed up successive pd.apply with successive pd.DataFrame.loc calls?

# Price-lookup helper from the question: maps each row's (fund, share_class)
# pair to a column of the module-level df_hist frame via a hard-coded
# if/elif chain, then fetches the price for the row's date with one .loc
# call per row (the per-row .loc is the slow part being asked about).
def __link_price(row: pd.Series) -> Union[None, float]:
if (row['fund'] == 'A') and (row['share_class'] == 'X'):
return df_hist.loc[row['date'], 'AA']
elif (row['fund'] == 'A') and (row['share_class'] == 'Y'):
return df_hist.loc[row['date'], 'AB']
elif (row['fund'] == 'B') and (row['share_class'] == 'X'):
return df_hist.loc[row['date'], 'BA']
elif (row['fund'] == 'B') and (row['share_class'] == 'Y'):
return df_hist.loc[row['date'], 'BB']
elif (row['fund'] == 'C') and (row['share_class'] == 'X'):
return df_hist.loc[row['date'], 'CA']
elif (row['fund'] == 'C') and (row['share_class'] == 'Y'):
return df_hist.loc[row['date'], 'CB']
else:
# No matching fund/share_class combination: price defaults to 0.
return 0
# NOTE(review): the function above is named __link_price, but the apply call
# below passes __link_price_at_purchase — presumably the same function under
# a different name in the real codebase; confirm.
df.loc[:, 'price'] = df.apply(__link_price_at_purchase, axis=1).values
df has 10,000+ lines, so this code is taking a long time. In addition for each row, I'm doing a df_hist.loc call to get the value.
I'm trying to speed up this section of code, and the only option I've found so far is using:
df.loc[:, 'price'] = df.apply(__link_price_at_purchase, axis=1, raw=True).values
But this forces me to use index based selection for row instead of value selection:
if (row[0] == 'A') and (row[1] == 'X')
which reduces the readability of the code.
I'm looking for an approach that both speeds up the code and still allows for readability of the code.
In python, there is a certain cost for each attribute or item lookup and function call. And you don't have a compiler that optimizes things for you.
Here are some general recommendations:
Try creating a column that includes fund and share_class without using Python functions and then merge it with df_hist
# convert history from 'wide' format into 'long' format
# (stack() moves the price columns into a (date, column) MultiIndex Series)
hist = df_hist.set_index("date").stack()
prices = (
# create key column for join
# NOTE(review): assumes the history columns are fund letter + {"A","B"}
# suffix, so share_class "X" maps to "A" and "Y" to "B" — verify against
# the actual df_hist columns.
df.assign(key=df["fund"] + df["share_class"].replace({"X": "A", "Y": "B"}))
.set_index(["date", "key"])
.join(hist) # join by index
)
If it's not trivial to create a key column, minimize attribute lookups inside the apply function:
def __link_price(row):
date, fund, share_class = row[["date", "fund", "share_class"]]
if fund == 'A' and share_class == 'X':
return df_hist.loc[date, 'AA']
...
Optimize if conditions. For example, you need to check 6 conditions in case where (row['fund'] == 'C') and (row['share_class'] == 'Y'). You can reduce this number to ... 1.
fund_and_share_class_to_key = {
("A", "X"): "AA",
("A", "Y"): "AB",
...
}
key = fund_and_share_class_to_key.get((fund, share_class))
return df_hist.loc[date, key] if key is not None else 0
Pandas itself is pretty slow for non-vectorized and non-arithmetic operations. In your case it's better to use standard python dicts for faster lookups.
# small benchmark
df = pd.DataFrame({"value": [4,5,6]})
d = df.to_dict(orient="index")
%timeit df.loc[1, "value"] # 8.7ms
%timeit d[1]["value"] # 50ns; ~170 times faster
# convert dataframe into the dict with format:
# {<date>: {"AA": <value>}}
history = df_hist.set_index("date").to_dict(orient="index")
def __link_price(row):
...
price = history.get(date, {}).get(key, 0)
return price
It should be faster to pass history as an apply argument rather than search it in the non-local scope. It also makes the code cleaner.
def __link_price(row, history):
...
df.apply(__link_price, args=(history, ))
To summarize, a faster function would be something like this:
history = df_hist.set_index("date").to_dict(orient="index")
# we don't need to create a mapping on every __link_price call
fund_and_share_class_to_key = {
("A", "X"): "AA",
("A", "Y"): "AB",
...
}
def __link_price(row, history, fund_and_share_class_to_key):
    """Resolve one row's price using plain-dict lookups (the fast path).

    row: a pandas Series with "date", "fund" and "share_class" labels.
    history: {date: {column_key: price}} built from df_hist.
    fund_and_share_class_to_key: {(fund, share_class): column_key}.
    Returns the matching price, or 0 when no key/price exists.
    """
    date = row["date"]
    lookup_key = fund_and_share_class_to_key.get((row["fund"], row["share_class"]))
    return history.get(date, {}).get(lookup_key, 0)
df.apply(__link_price, args=(history, fund_and_share_class_to_key))

Limit number of items / length of json for logging

I am working on an API that returns JSON. I am logging my responses, and sometimes the JSON is just absurdly long and basically clogs my log files. Is there a neat way to reduce the length of a JSON, purely for visually logging the data? (not in effect in production)
The basic approach is to reduce arrays over a length of 5 to [first 2, "...", last 2], and dictionaries with more than 4 items to {first 4, "..." : "..."}
The code below is ugly. I am aware that it should be a recursive solution that reduces the items in the same way for a JSON of arbitrary depth - it currently only does so for depth 2.
# Shrink a decoded JSON payload (dict/list) for logging: lists with 5+ items
# become [first 2, '...', last 2]; dicts with 4+ keys keep their first 4 plus
# a '...' marker.  Only handles two levels of nesting (acknowledged above).
def log_reducer(response_log):
# NOTE(review): this is an alias, not a copy — when the code below mutates
# response_log in place (rather than rebinding it), the "original" returned
# from the except branch has been mutated too.
original_response_log = response_log
try:
if type(response_log) == dict:
if len(response_log) >= 4: # {123456}
# Rebinds response_log to a new, truncated dict.
response_log = dict(list(response_log.items())[:4])
response_log.update({"...": "..."}) # {1234...}
# Updating existing keys while iterating items() is safe (no resize).
for key, value in response_log.items():
if type(value) == list:
if len(value) >= 5: # {key:[123456]}
new_item = value[:2] + ['...'] + value[-2:] # {[12...56]}
response_log.update({key: new_item})
if type(value) == dict:
if len(value) >= 4: # {key:{123456}}
reduced_dict = dict(list(value.items())[:4])
reduced_dict.update({"...": "..."})
response_log.update({key: reduced_dict}) # {{1234...}}
elif type(response_log) == list:
if len(response_log) >= 5: # [123456]
response_log = response_log[:2] + ['...'] + response_log[-2:] # [12...56]
for inner_item in response_log:
if type(inner_item) == list:
if len(inner_item) >= 5: # [[123456]]
reduced_list = inner_item[:2] + ['...'] + inner_item[-2:] # [[12...56]]
# NOTE(review): remove/append while iterating the same list can skip
# elements and moves reduced items to the end — confirm that is intended.
response_log.remove(inner_item)
response_log.append(reduced_list)
if type(inner_item) == dict:
if len(inner_item) >= 4: # [{123456}]
reduced_dict = dict(list(inner_item.items())[:4])
reduced_dict.update({"...": "..."}) # [{1234...}]
response_log.remove(inner_item)
response_log.append(reduced_dict)
except Exception as e:
# Best-effort: on any failure, log the unreduced payload instead.
return original_response_log
return response_log
The returned response_log is then logged with logger.info(str(response_log))
As you can see, the fact that there can be either arrays or dictionaries at every level makes this task a little more complex, and I am struggling to find a library or code snipped of any kind which would simplify this. If anyone wants to give it a shot, I would appreciate it a lot.
you can use a test JSON like this to see it in effect:
test_json = {"works": [1, 2, 3, 4, 5, 6],
"not_affected": [{"1": "1", "2": "2", "3": "3", "4": "4", "5": "5"}],
"1": "1", "2": "2", "3": "3",
"removed": "removed"
}
print("original", test_json)
reduced_log = log_reducer(test_json)
print("reduced", reduced_log)
print("original", test_json)
reduced_log = log_reducer([test_json]) # <- increases nesting depth
print("reduced", reduced_log)
This answer uses #calceamenta's idea, but implements the actual cutting-down logic:
def recursive_reduce(obj):
    """Return a copy of `obj` with oversized containers elided for logging.

    Dicts and lists with more than 5 entries keep their first two and last
    two entries (dicts in sorted-key order) with an "..." marker between
    them; smaller containers are copied as-is (dicts re-ordered by key).
    Scalars pass through; anything else is rendered with str().

    Bug fix: the original did ``obj['...'] = '...'`` on every dict and
    rewrote small lists in place, mutating the caller's data.  This version
    never mutates its input.
    """
    if isinstance(obj, (float, str, int, bool, type(None))):
        return obj
    if isinstance(obj, dict):
        keys = sorted(obj)
        if len(keys) > 5:
            # first 2 keys, an elision marker, then the last 2 keys.
            reduced = {key: recursive_reduce(obj[key]) for key in keys[:2]}
            reduced["..."] = "..."
            for key in keys[-2:]:
                reduced[key] = recursive_reduce(obj[key])
            return reduced
        return {key: recursive_reduce(obj[key]) for key in keys}
    if isinstance(obj, list):
        items = obj[:2] + ["..."] + obj[-2:] if len(obj) > 5 else obj
        return [recursive_reduce(item) for item in items]
    # Fallback: unknown types are logged via their string form.
    return str(obj)

test_json = {"works": [1, 2, 3, 4, 5, 6],
             "not_affected": [{"1": "1", "2": "2", "3": "3", "4": "4", "5": "5"}],
             "1": "1", "2": "2", "3": "3",
             "removed": "removed"
             }
print("original", test_json)
reduced_log = recursive_reduce(test_json)
print("reduced", reduced_log)
Output:
original {'works': [1, 2, 3, 4, 5, 6], 'not_affected': [{'1': '1', '2': '2', '3': '3', '4': '4', '5': '5'}], '1': '1', '2': '2', '3': '3', 'removed': 'removed'}
reduced {'1': '1', '2': '2', '...': '...', 'removed': 'removed', 'works': [1, 2, '...', 5, 6]}
Hope this helps :)
You can overwrite the string representation of dicts and lists in python using the def __str__(): method. Using this just recursively call the print function on all elements. It can have a simple boilerplate like this:
def custom_print(obj):
    """Recursively flatten `obj` into a single log string.

    Lists and dicts are walked depth-first; every leaf value is rendered
    with str() and concatenated.
    """
    log_str = ''
    if isinstance(obj, list):
        for item in obj:
            log_str += custom_print(item)
    elif isinstance(obj, dict):
        for k, item in obj.items():
            # Bug fix: the recursive result was computed but discarded.
            log_str += custom_print(item)
    else:
        # Bug fix: base case so scalar leaves actually appear in the log.
        log_str = str(obj)
    # Bug fix: the original never returned, so callers always got None.
    return log_str
Use this custom log function to print into your log file as per your log file format.

Given a mapping of user accounts to dates and a sorted list of dates, efficiently return a mapping from each index in the sorted dates list to the users whose dates fall into that interval

I am duplicating Facebook's chat read receipt system. I wrote some basic code I think works. However my boss thinks it would be slow. I have no algorithms training. What is the most efficient way to return a mapping of indexes to numbers where the numbers are between two numbers in a sorted list and the index is the index of the first number in the between pair?
# Given say {"a": 3, "b": 10, "c": 7, "d": 19} and [1,5,15] return {0: ["a"], 1: ["b", "c"], 2: ["d"]}
def find_read_to(read_dates, message_dates):
"""Map each message index i to the users whose last-read date falls in
[message_dates[i], message_dates[i+1])."""
read_indexes_to_user_ids = {}
# Linear scan per user: O(len(read_dates) * len(message_dates)).
for user_id in read_dates:
for i, date in enumerate(message_dates):
last_index = i + 1 == len(message_dates)
next_index = -1 if last_index else i + 1
# NOTE(review): when no earlier bucket matched, `last_index` alone makes
# this True, so a read date smaller than every message date gets filed
# under the final index; the bisect version below skips such users
# instead — confirm which behaviour is wanted.
if last_index or (read_dates[user_id] >= date and read_dates[user_id] < message_dates[next_index]):
if i in read_indexes_to_user_ids:
read_indexes_to_user_ids[i].append(user_id)
else:
read_indexes_to_user_ids[i] = [user_id]
break
return read_indexes_to_user_ids
find_read_to({"a": 3, "b": 10, "c": 7, "d": 19}, [1,5,15])
Version using bisect module
import bisect

def find_read_to(read_dates, message_dates):
    """Group user ids by the index of the last message they have read.

    read_dates: {user_id: read_date}
    message_dates: sorted list of message dates
    Returns {message_index: [user_id, ...]}.  Users whose read date predates
    every message are omitted.
    """
    read_indexes_to_user_ids = {}
    # Bug fix: the original unpacked zip(*read_dates.items()) into two
    # variables, which raises ValueError when read_dates is empty.
    for user_id, read_date in read_dates.items():
        # Index of the last message dated strictly before read_date.
        # NOTE(review): with bisect_left, a read date exactly equal to a
        # message date maps to the *previous* index (matches the original
        # answer); use bisect_right to count the equal message as read.
        index_for_read_up_to = bisect.bisect_left(message_dates, read_date) - 1
        if index_for_read_up_to == -1:
            continue
        read_indexes_to_user_ids.setdefault(index_for_read_up_to, []).append(user_id)
    return read_indexes_to_user_ids

find_read_to({"a": 3, "b": 10, "c": 7, "d": 19}, [1, 5, 15])

Modifying nested dictionaries

Given this two dicts:
empty = {'151': {'1': 'empty', '0': 'empty', '2': '2.30'}}
full = {'151': {'1': 3.4, '0': 3.6, '2': 2}}
Firstly, I want to check if empty.keys() == full.keys(); if that holds, I want to replace the 'empty' values with the corresponding value from the full dictionary. It ought to result in:
not_empty = {'151': {'1': '3.4', '0': '3.6', '2': '2.30'}}
My solution so far: I thought I would identify all the keys with empty values using regex, but for whatever reason my code so far, produces an empty dict {}.
# NOTE(review): this snippet is Python 2 (print statement, iteritems,
# indexing dict.values()); it will not run under Python 3.
import re
# Extracts ONE key whose value is 'empty' — [0] keeps only the first match.
find_empty = re.findall("'(\d)':\s'empty'", str(empty))[0]
if empty.keys() == full.keys():
k = empty.values()[0].keys()
v = empty.values()[0].values()
# The filter re-tests the same find_empty key on every iteration, and that
# key maps to 'empty' by construction, so the condition is always False —
# which is why this prints an empty dict {}.
print {k:v for k,v in empty.values()[0].iteritems()\
if empty.values()[0][find_empty] != 'empty'}
I hoped it can output {'151': {'2': '2.30'}} for a good starting point. Anyway, I guess there exists more clean solution then regex for this task so any hints are welcomed!
Regex is not the right tool for this job. I would suggest a recursive approach like the following.
empty = {'151': {'1': 'empty', '0': 'empty', '2': '2.30'}}
full = {'151': {'1': 3.4, '0': 3.6, '2': 2}}

def repl(a, b):
    """Return a copy of `a` where each 'empty' value is replaced by the
    value stored under the same key in `b` (or 'Not there' when `b` lacks
    that key).  Nested dict values are handled recursively; every other
    value is kept unchanged."""
    def fill(key, value):
        if value == 'empty':
            # Replace the placeholder, defaulting when b has no such key.
            return b.get(key, 'Not there')
        if isinstance(value, dict):
            # Descend into the matching sub-dict of b (empty dict if absent).
            return repl(value, b.get(key, {}))
        return value
    return {key: fill(key, value) for key, value in a.items()}

print(repl(empty, full))
OUTPUT
{'151': {'1': 3.4, '0': 3.6, '2': '2.30'}}
EDIT I am not sure if this takes care of all of your cases, but it probably worth a look anyway.
empty = {'151': {'1': 'empty', '0': 'empty', '2': '2.30', '8': ['empty', 'empty', 5, {"foo2": "bar2", "1": "empty"}]}}
full = {'151': {'1': 3.4, '0': 3.6, '2': 2, '8': ['foo', 'bar', 'baz', {"foo3": "bar3", "1": "2"}]}}

def repl(a, b):
    """Recursively fill 'empty' placeholders in `a` with values from `b`.

    Handles dicts and lists at any depth.  Dict keys missing from `b`
    default to 'Not there'; list items are matched positionally.
    Returns a new structure; `a` is not modified.
    """
    if isinstance(a, dict) and isinstance(b, dict):
        clean = {}
        for k, v in a.items():
            # Replace the placeholder with b's value if we have one.
            if v == 'empty':
                clean[k] = b.get(k, 'Not there')
            # Recurse into nested containers, pairing with b's counterpart.
            elif isinstance(v, dict):
                clean[k] = repl(v, b.get(k, {}))
            elif isinstance(v, list):
                clean[k] = repl(v, b.get(k, []))
            # Not 'empty' and not a container: keep the current value.
            else:
                clean[k] = v
        return clean
    if isinstance(a, list) and isinstance(b, list):
        clean = []
        # zip() truncates to the shorter list, matching the original code.
        for item_a, item_b in zip(a, b):
            if item_a == 'empty':
                clean.append(item_b)
            # Generalized: recurse into nested lists too, not just dicts,
            # for consistency with the dict branch above.
            elif isinstance(item_a, (dict, list)):
                clean.append(repl(item_a, item_b))
            else:
                clean.append(item_a)
        return clean
    # Bug fix: mismatched types (e.g. dict in `a`, scalar in `b`) used to
    # fall through both branches and raise UnboundLocalError on `clean`;
    # keep `a` unchanged instead.
    return a

print(repl(empty, full))
OUTPUT
{'151': {'1': 3.4, '0': 3.6, '2': '2.30', '8': ['foo', 'bar', 5, {'1': '2', 'foo2': 'bar2'}]}}

Categories

Resources