Performance problem when using pandas apply on big dataframes - python

I'm having some performance issues with the code below, mostly because of the apply function that I'm using on a huge dataframe. I want to update the semi_dict dictionary with some other data that I'm calculating with some functions. Is there any way to improve this?
def my_function_1(semi_dict, row):
    """Compute per-row values and append them to semi_dict["data"].

    Called once per dataframe row via DataFrame.apply; mutates semi_dict
    in place and returns None.
    """
    # do some calculation/other stuff based on the row data and append it to the dictionary
    # NOTE(review): some_data / more_data are placeholders not defined in
    # this snippet -- confirm against the real code.
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data"].append(random_dict)
def my_function_2(semi_dict, row):
    """Compute per-row values and append them to semi_dict["data2"].

    Mirror of my_function_1 but targets the "data2" list; mutates
    semi_dict in place and returns None.
    """
    # do some calculation/other stuff based on the row data and append it to the dictionary
    # NOTE(review): some_data / more_data are placeholders not defined in
    # this snippet -- confirm against the real code.
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data2"].append(random_dict)
# Build one semi_dict per value, filling it via row-wise apply side effects.
dictionary_list = []
for v in values:
    # NOTE(review): these lines rebind df_1_rows/df_2_rows to their own
    # filtered subset, so every iteration filters the remainder of the
    # previous one -- most likely the full frame was meant to be filtered
    # each time. Kept as posted; confirm against the real code.
    df_1_rows = df_1_rows[(df_1_rows.values == v)]
    df_2_rows = df_2_rows[(df_2_rows.values == v)]
    semi_dict = dict(value=v, data=[], data2=[])
    function = partial(my_function_1, semi_dict)
    function_2 = partial(my_function_2, semi_dict)
    # apply() is used purely for its side effect on semi_dict; the
    # returned Series is discarded (a major source of the slowness).
    df_1_rows.apply(lambda row: function(row), axis=1)
    df_2_rows.apply(lambda row: function_2(row), axis=1)
    dictionary_list.append(semi_dict)

This answer uses dictionary merge from How to merge dictionaries of dictionaries?, but depending on your use case, you might not need it in the end:
import pandas as pd
import random
# Demo fixtures: two 10-row frames whose first columns draw from the same
# small pool of keys ("A".."D"), so the later groupby keys can overlap
# between the two frames.
len_df = 10
row_values = list("ABCD")
extra_col_values = list("12345")
df_1 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col1', 'extra1'])
df_2 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col2', 'extra2'])
def make_dict(df):
    """Return a summary dict for one group's sub-frame.

    Placeholder for real per-group calculations: here it just records the
    first row of the group as a list of lists under key 'data'.
    """
    # some calculations on the df
    return {
        'data': df.head(1).values.tolist(),
    }
def make_dict_2(df):
    """Return a summary dict for one group's sub-frame.

    Same shape as make_dict but keyed 'data_2', so the two per-frame
    results can be merged without key conflicts.
    """
    # some calculations on the df
    return {
        'data_2': df.head(1).values.tolist(),
    }
def merge(a, b, path=None):
    """Recursively merge dict b into dict a and return a (a is mutated).

    Nested dicts are merged key by key; equal leaf values are kept;
    differing leaf values raise an Exception naming the conflicting path.
    Taken from https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries
    """
    if path is None:
        path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass  # same leaf value
            else:
                raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
        else:
            a[key] = b[key]
    return a
# Group each frame by its key column and build one small dict per group,
# giving a {key: {'data': ...}} / {key: {'data_2': ...}} mapping each.
dict1 = df_1.groupby('col1').apply(make_dict).to_dict()
dict2 = df_2.groupby('col2').apply(make_dict_2).to_dict()
# merge() mutates and returns dict1, so result aliases dict1.
result = merge(dict1, dict2)
# Bare expression -- presumably run in a notebook/console cell to display it.
result

Related

What's the best method to create a dictionary from outputs of multiple for loops

This is my code:
def get_coin_tickers(url):
    """Fetch *url* and return the response body parsed as JSON (a dict)."""
    req = requests.get(url)
    # print(req.text)
    resp = json.loads(req.text)
    # print(resp.text)  # NOTE(review): resp is a dict here, it has no .text
    return resp
pair_a_list = ["BTC_USDT", "EOS_USDT", "ETH_USDT"]
pair_b_list = ["SOL_USDT", "MATIC_USDT", "SUSHI_USDT"]
for pair_a in pair_a_list:
    orderbook_url = f'https://api.pionex.com/api/v1/market/depth?symbol={pair_a}&limit=5'
    pair_a_prices_json = get_coin_tickers(orderbook_url)
    pair_a_ask = pair_a_prices_json['data']['asks'][0][0]
    pair_a_bid = pair_a_prices_json['data']['bids'][0][0]
    for pair_b in pair_b_list:
        orderbook_url = f'https://api.pionex.com/api/v1/market/depth?symbol={pair_b}&limit=5'
        # NOTE(review): result stored into pair_a_prices_json, overwriting
        # the outer lookup -- works only because the asks/bids were already
        # extracted above.
        pair_a_prices_json = get_coin_tickers(orderbook_url)
        pair_b_ask = pair_a_prices_json['data']['asks'][0][0]
        pair_b_bid = pair_a_prices_json['data']['bids'][0][0]
# BUG (the question's symptom): the dict is assembled only after both loops
# finish, so it sees just the final loop values (ETH_USDT / SUSHI_USDT).
# Building/collecting it inside the inner loop would capture every pair.
keys = ['pair_a', 'pair_a_ask', 'pair_a_bid', 'pair_b', 'pair_b_ask', 'pair_b_bid']
values = [pair_a, pair_a_ask, pair_a_bid, pair_b, pair_b_ask, pair_b_bid]
mydict = {k: v for (k, v) in zip(keys, values)}
print(mydict)
I'm able to create a Dictionary but with only one Symbol-Pairs from each list. Which seems to be outputs from only the last symbol pairs of both the lists:
{'pair_a': 'ETH_USDT', 'pair_a_ask': '1254.18', 'pair_a_bid': '1253.51', 'pair_b': 'SUSHI_USDT', 'pair_b_ask': '0.9815', 'pair_b_bid': '0.9795'}
I'm expecting to see a combined dictionary with values from both lists as keys (with their API values) after iterating through both lists using for loops.
I found a solution to my question based on a suggestion from @JonSG, adapted to my use case. I've included the code below for others to use if the need arises.
# Walk the three symbol lists in lockstep, building one ask/bid dict per list.
# NOTE(review): the loop variables reuse (and shadow) the list names
# pair_a_list/pair_b_list/pair_c_list -- it works, but the lists are
# destroyed after the first loop; distinct names would be clearer.
for pair_a_list, pair_b_list, pair_c_list in zip(pair_a_list, pair_b_list, pair_c_list):
    orderbook_a_url = f'https://api.pionex.com/api/v1/market/depth?symbol={pair_a_list}&limit=1'
    pair_a_prices_json = get_coin_tickers(orderbook_a_url)
    pair_a_ask = pair_a_prices_json['data']['asks'][0][0]
    pair_a_bid = pair_a_prices_json['data']['bids'][0][0]
    my_dict_a = {
        'pair_a_ask': pair_a_ask,
        'pair_a_bid': pair_a_bid
    }
    orderbook_b_url = f'https://api.pionex.com/api/v1/market/depth?symbol={pair_b_list}&limit=1'
    pair_b_prices_json = get_coin_tickers(orderbook_b_url)
    pair_b_ask = pair_b_prices_json['data']['asks'][0][0]
    pair_b_bid = pair_b_prices_json['data']['bids'][0][0]
    my_dict_b = {
        'pair_b_ask': pair_b_ask,
        'pair_b_bid': pair_b_bid
    }
    orderbook_c_url = f'https://api.pionex.com/api/v1/market/depth?symbol={pair_c_list}&limit=1'
    pair_c_prices_json = get_coin_tickers(orderbook_c_url)
    pair_c_ask = pair_c_prices_json['data']['asks'][0][0]
    pair_c_bid = pair_c_prices_json['data']['bids'][0][0]
    my_dict_c = {
        'pair_c_ask': pair_c_ask,
        'pair_c_bid': pair_c_bid
    }
    # (Use either option below.)
    # my_dict = {**my_dict_a, **my_dict_b, **my_dict_c}
    # my_dict = my_dict_a | my_dict_b | my_dict_c

Recursive relations search between 2 columns in a table [Using python list / Dict]

I am trying to optimize a solution that I created to find recursive relations between 2 columns in a table. I need to find all accIDs for a bssID and recursively find all the bssIDs for those accIDs and so on till I find all the related bssIDs.
| bssIDs | accIDs |
| ------ | ------ |
| ABC    | 4424   |
| ABC    | 56424  |
| ABC    | 2383   |
| A100BC | 2383   |
| A100BC | 4943   |
| A100BC | 4880   |
| A100BC | 6325   |
| A100BC | 4424   |
| XYZ    | 123    |
The below solution works for an initial table of 100K rows but the below solution runs for >16 hours for a dataset of 20 million rows. I am trying to use dicts instead of list but I am unable to change the dict while iterating over the same as I am with a list.
import time
# Reverse index: accID -> list of bssIDs that the account appears under.
accIds = {4880: ['A100BC'], 6325: ['A100BC'], 2383: ['A100BC','ABC'],4424: ['A100BC','ABC'], 4943: ['A100BC'], 56424: ['ABC'],123: ['XYZ']}
# Forward index: bssID -> list of accIDs under it (mirror of accIds).
bssIds = {'ABC': [4424,56424,2383], 'A100BC': [2383,4943,4880,6325,4424], 'XYZ':[123]}
def findBIDs(aID):
    """Return the list of bssIDs the given accID belongs to (module accIds)."""
    return accIds[aID]

def findAIDs(bID):
    """Return the list of accIDs under the given bssID (module bssIds)."""
    return bssIds[bID]

def getList(Ids):
    """Return a view of the mapping's keys."""
    return Ids.keys()

def checkList(inputList, value):
    """Return True if value is present in inputList (O(n) for a list)."""
    return (value in inputList)

def addToList(inputList, value):
    """Append value to inputList in place; returns None (list.append's result)."""
    return inputList.append(value)

def removeFromList(inputList, value):
    """Remove the first occurrence of value in place; returns None."""
    return inputList.remove(value)
aIDlist = list(getList(accIds))
bIDlist = list(getList(bssIds))
bRelations = {}
runningList = list()
for x in bIDlist:
    # Skip bssIDs already swallowed into an earlier group.
    if not checkList(runningList, x):
        aList = list()
        bList = list()
        addToList(bList, x)
        # bList grows while being iterated, so newly discovered bssIDs are
        # processed too -- this is what makes the closure transitive.
        # NOTE(review): indentation reconstructed from the scraped snippet;
        # this nesting reproduces the output shown below -- confirm.
        for y in bList:
            for c in findAIDs(y):
                if not checkList(aList, c):
                    addToList(aList, c)
            for z in aList:
                for a in findBIDs(z):
                    if not checkList(bList, a):
                        addToList(bList, a)
        # Keyed by timestamp only to get a unique key per group.
        bRelations.update({time.time_ns(): bList})
        runningList.extend(bList)
print(bRelations)
Output : {1652374114032173632: ['ABC', 'A100BC'], 1652374114032180888: ['XYZ']}
Please suggest if there is a way to update a dict while iterating over it or If we can apply a recursive solution for the same.
This is the fastest I could think of:
# Same two indexes as above, but with frozenset values so set difference
# ("-") can compute the not-yet-seen IDs in one O(len) operation.
accIds = {4880: frozenset(['A100BC']), 6325: frozenset(['A100BC']), 2383: frozenset(['A100BC','ABC']),4424: frozenset(['A100BC','ABC']), 4943: frozenset(['A100BC']), 56424: frozenset(['ABC']),123: frozenset(['XYZ'])}
bssIds = {'ABC': frozenset([4424,56424,2383]), 'A100BC': frozenset([2383,4943,4880,6325,4424]), 'XYZ':frozenset([123])}

def search_bssid(bssId):
    """Return the set of all bssIDs transitively related to *bssId*.

    Alternates between the two indexes with explicit work stacks until no
    new IDs are discovered (a bipartite connected-component search).
    """
    traversed_accIds = set()
    traversed_bssIds = {bssId}
    accIds_to_check = []
    bssIds_to_check = [bssId]
    while bssIds_to_check:
        bssId = bssIds_to_check.pop()
        # accIDs under this bssID that we have not expanded yet
        new_accids = bssIds[bssId] - traversed_accIds
        traversed_accIds.update(new_accids)
        accIds_to_check.extend(new_accids)
        while accIds_to_check:
            accId = accIds_to_check.pop()
            # bssIDs owning this accID that we have not visited yet
            new_bssids = accIds[accId] - traversed_bssIds
            traversed_bssIds.update(new_bssids)
            bssIds_to_check.extend(new_bssids)
    return traversed_bssIds

print(search_bssid("ABC"))

How can I simplify this function, random pandas DF selection adding to dictionary?

How can I simplify this function I am trying to create? I would like to pull data from a csv. Turn it into a Dataframe, randomly select a choice, add that choice to a corresponding dictionary key value pair.
def generate_traits():
    """Load each trait table from CSV, pick one random row from each, and
    return a dict mapping trait labels to the chosen values."""
    import pandas as pd
    # One CSV per trait category.
    df_bonds = pd.read_csv('/file/location_1')
    df_alignments = pd.read_csv('/file/location_2')
    df_faiths = pd.read_csv('/file/location_3')
    df_flaws = pd.read_csv('/file/location_4')
    df_ideals = pd.read_csv('/file/location_5')
    df_lifestyles = pd.read_csv('/file/location_6')
    df_organizations = pd.read_csv('/file/location_7')
    df_personalities = pd.read_csv('/file/location_8')
    df_names = pd.read_csv("/file/location_9")
    # sample(1) returns a one-row frame; .iloc[0, 0] below extracts the value.
    random_bond = df_bonds.sample(1)
    random_alignment = df_alignments.sample(1)
    random_faith = df_faiths.sample(1)
    random_flaw = df_flaws.sample(1)
    random_ideal = df_ideals.sample(1)
    random_lifestyle = df_lifestyles.sample(1)
    random_organization = df_organizations.sample(1)
    random_personality = df_personalities.sample(1)  # typo "personaltiy" fixed
    random_name = df_names.sample(1)
    traits_dict = {"Name:": random_name.iloc[0,0],
                   "Alignment:": random_alignment.iloc[0,0],
                   "Bond:":random_bond.iloc[0,0],
                   "Religion:":random_faith.iloc[0,0],
                   "Flaw:":random_flaw.iloc[0,0],
                   "Ideal:":random_ideal.iloc[0,0],
                   "Lifestyle:":random_lifestyle.iloc[0,0],
                   "Organization:":random_organization.iloc[0,0],
                   "Personality:":random_personality.iloc[0,0]}
    return traits_dict
You can chain your operations:
import pandas as pd

def generate_traits():
    """Build the traits dict by chaining read -> sample -> extract per file.

    NOTE(review): key 'Alignment:' keeps a trailing colon unlike the other
    keys here -- reproduced as posted.
    """
    return {'Name': pd.read_csv('/file/location_1').sample(1).iloc[0,0],
            'Alignment:': pd.read_csv('/file/location_2').sample(1).iloc[0,0],
            'Bond': pd.read_csv('/file/location_3').sample(1).iloc[0,0],
            'Religion': pd.read_csv('/file/location_4').sample(1).iloc[0,0],
            'Flaw': pd.read_csv('/file/location_5').sample(1).iloc[0,0],
            'Ideal': pd.read_csv('/file/location_6').sample(1).iloc[0,0],
            'Lifestyle': pd.read_csv('/file/location_7').sample(1).iloc[0,0],
            'Organization': pd.read_csv('/file/location_8').sample(1).iloc[0,0],
            'Personality': pd.read_csv('/file/location_9').sample(1).iloc[0,0]}
def generate_traits():
    """Build the traits dict data-driven: map label -> CSV location, load all
    frames, then sample one cell per frame."""
    import pandas as pd
    # NOTE(review): the posted listing was missing the commas between these
    # entries, which is a SyntaxError; commas restored.
    name_location = {'Bond': 'location_1',
                     'Alignment': 'location_2',
                     'Religion': 'location_3',
                     'Flaw': 'location_4',
                     'ideals': 'location_5',
                     'Lifestyle': 'location_6',
                     'Organization': 'location_7',
                     'Personality': 'location_8',
                     'Name': 'location_9'}
    all_df = {name: pd.read_csv(f'/file/{loc}') for name, loc in name_location.items()}
    traits_dict = {name: df.sample(1).iloc[0, 0] for name, df in all_df.items()}
    return traits_dict

'itertools._grouper' object has no attribute 'user'

Why can't I convert the loop variable group from groupby to a list? Currently, I am working on Django==2.2.1, and when I try this data = [...] below in the Python console, it works fine.
from itertools import groupby
from operator import itemgetter

# @login_required  -- decorator; shown as "#login_required" in the scrape.
# NOTE(review): naming this view `list` shadows the builtin, so a call to
# `list(group)` inside the body resolves to the (decorated) view itself:
# login_required's wrapper then treats the itertools._grouper as `request`
# and accesses `.user` on it -- exactly the reported error. The group is
# therefore materialized below without using the name `list`.
def list(request, template_name='cart/list.html'):
    # I also try with this dummy data
    test_data = [{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':7.0}]
    print(type(test_data))  # a list (the snippet's `data` was undefined; fixed)
    sorted_totals = sorted(test_data, key=itemgetter('total_order'))
    # NOTE(review): groupby only groups consecutive items -- for grouping by
    # agent_name the data would need to be sorted by agent_name as well.
    for agent_name, group in groupby(sorted_totals, key=lambda x: x['agent_name']):
        print(agent_name, [*group])
But I get an error like the one above when I run it in a Django view.
I also tried it with defaultdict
from collections import defaultdict

_builtin_list = list  # capture the builtin before the view below shadows it

# @login_required  -- decorator; shown as "#login_required" in the scrape.
# NOTE(review): with the view named `list`, the original
# `defaultdict(list)` used the decorated view as the default factory, so
# the first `grouped[...]` access invoked login_required's wrapper --
# again producing an error from the wrapper. The captured builtin is used
# instead, both as factory and in the isinstance check.
def list(request, template_name='cart/list.html'):
    test_data = [{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':7.0}]
    grouped = defaultdict(_builtin_list)
    for data_total in test_data:
        grouped[data_total['agent_name']].append(data_total)
    grouped_out = []
    for agent_name, group in grouped.items():
        total_order = 0
        total_pcs = 0
        total_kg = 0
        if isinstance(group, _builtin_list):
            for data_total in group:
                total_order += data_total.get('total_order')
                total_pcs += data_total.get('total_pcs')
                total_kg += data_total.get('total_kg')
        grouped_out.append({
            'agent_name': agent_name,
            'total_order': total_order,
            'total_pcs': total_pcs,
            'total_kg': total_kg
        })
But the error still occurred, raised from the decorator's wrapper view. Following the previous issue, the traceback references this _wrapped_view.
Finally, I fixed it manually by using a dict.
test_data = [{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':7.0}]
# Accumulate totals per agent_name into a plain dict.
grouped = {}
for data_total in test_data:
    agent_name = data_total.get('agent_name')
    if agent_name in grouped:
        new_data = grouped[agent_name]  # dict
        new_data['total_order'] += data_total.get('total_order')
        new_data['total_pcs'] += data_total.get('total_pcs')
        new_data['total_kg'] += data_total.get('total_kg')
        # NOTE(review): new_data IS grouped[agent_name], so this update is a
        # no-op; kept to preserve the posted behavior.
        grouped[agent_name].update(**new_data)
    else:
        # NOTE(review): stores a reference to the test_data entry, so the
        # accumulation above also mutates that entry in test_data.
        grouped[agent_name] = data_total
And the result of grouped is look like this:
{'agent123': {'agent_name': 'agent123',
'total_kg': 18.0,
'total_order': 3,
'total_pcs': 3},
'agentbeli': {'agent_name': 'agentbeli',
'total_kg': 17.0,
'total_order': 3,
'total_pcs': 3}}

Construct python dict from DeepDiff result

I have a DeepDiff result which is obtained by comparing two JSON files. I have to construct a python dictionary from the deepdiff result as follows.
json1 = {"spark": {"ttl":3, "poll":34}}
json2 = {"spark": {"ttl":3, "poll":34, "toll":23}, "cion": 34}
deepdiffresult = {'dictionary_item_added': {"root['spark']['toll']", "root['cion']"}}
expecteddict = {"spark" : {"toll":23}, "cion":34}
How can this be achieved?
There is probably a better way to do this. But you can parse the returned strings and chain together a new dictionary with the result you want.
# Two JSON-like dicts to compare; json2 adds spark.toll and top-level cion.
json1 = {"spark": {"ttl":3, "poll":34}}
json2 = {"spark": {"ttl":3, "poll":34, "toll":23}, "cion": 34}
# Shape of a DeepDiff result: a set of "root[...]" path strings per change type.
deepdiffresult = {'dictionary_item_added': {"root['spark']['toll']", "root['cion']"}}
# The added-item path strings we need to turn back into a nested dict.
added = deepdiffresult['dictionary_item_added']
def convert(s, j):
    """Turn a DeepDiff path string like "root['a']['b']" into a nested dict
    whose leaf holds the value found at that path in *j*.

    e.g. convert("root['spark']['toll']", json2) -> {'spark': {'toll': 23}}
    """
    # Strip the textual scaffolding so only key names and ']' separators remain.
    s = s.replace('root','')
    s = s.replace('[','')
    s = s.replace("'",'')
    keys = s.split(']')[:-1]
    # Build the nested skeleton from the innermost key outwards.
    d = {}
    for k in reversed(keys):
        if not d:
            d[k] = None
        else:
            d = {k: d}
    # Walk j to fetch the leaf value, keeping v_ref on the leaf's parent
    # dict inside the skeleton.
    v = None
    v_ref = d
    for i, k in enumerate(keys, 1):
        if not v:
            v = j.get(k)
        else:
            v = v.get(k)
        if i < len(keys):
            v_ref = v_ref.get(k)
    # After the loop, k is the last key and v the value at the full path.
    v_ref[k] = v
    return d
# Convert every added-path string and fold the results into one dict.
added_dict = {}
for added_str in added:
    added_dict.update(convert(added_str, json2))
added_dict
# returns:
# {'cion': 34, 'spark': {'toll': 23}}
A simpler answer: Python has a third-party package called dictdiffer that provides a diff function. You can try this.
$ pip install dictdiffer
Examples:
from dictdiffer import diff

result = diff(json1, json2)
# NOTE(review): the original used Python-2 `print result` (a SyntaxError on
# Python 3); converted to a call. Also note dictdiffer.diff returns an
# iterator of (action, path, values) tuples, not a dict, so this
# comparison prints False -- confirm against the dictdiffer docs.
print(result == {"spark" : {"toll":23}, "cion":34})
References:
DictDiffer

Categories

Resources