Items combination and frequency count in Pandas - python

I have a dataset like this:
ORDER_CODE
ITEM_ID
ITEM_NAME
TOTALPRICE
123
id1
name1
345
321
id2
name2
678
and a function that calculates which items were sold together, and which of those were the most popular or the most expensive.
out:
ITEM_ID
sold together
id1
[ id33, id23, id12 ]
id2
[ id56, id663 ]
I am using this function:
# For each distinct ITEM_ID in df, counts the other item IDs that appear in the
# same orders, ranks them by 'freq' and then 'turn' (both descending) and keeps
# at most the first 15 as a comma-separated string.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# loops and conditionals below is inferred — confirm against the original script.
def freq(df):
# Placeholder from the post (not valid Python); the real list of IDs is not shown.
hit_list = [list of ID's]
result = pd.DataFrame(columns = ['ITEM_ID', 'sold together'])
unic_arc = df['ITEM_ID'].unique()
# unic_num and data_num are assigned but never read in this function.
unic_num = df['ORDER_CODE'].unique()
data_arc ={}
data_num={}
# One (initially empty) co-occurrence counter per item.
for i in unic_arc:
data_arc[i] = {}
# Total TOTALPRICE per item, used further down as the secondary sort key 'turn'.
# NOTE(review): `response_ur` is not defined in this function; presumably it
# should be `df` — confirm.
tturns = response_ur[['ITEM_ID', 'TOTALPRICE']].groupby(by = 'ITEM_ID', as_index = False).sum()
tturns = tturns.rename(columns = {'ITEM_ID' : 'inum', 'TOTALPRICE' : 'turn'})
for i in tqdm(unic_arc):
# Order codes of the orders that contain item i.
b = df[df['ITEM_ID'] == i]['ORDER_CODE'].values
for t in b:
# NOTE(review): this selects column 'ID', while every other lookup uses
# 'ITEM_ID' — confirm which column is meant.
a = df[df['ORDER_CODE'] == t]['ID'].values
if i in a:
for arc in a:
# Items listed in hit_list, and item i itself, are not counted.
if int(arc) not in hit_list:
if arc != i:
if arc in data_arc[i]:
data_arc[i][arc]+=1
else:
data_arc[i][arc] = 1
# dd is assigned but never read.
dd = data_arc[i]
tmp = pd.DataFrame(columns = ['inum', 'freq'])
tmp['inum'] = data_arc[i].keys()
tmp['freq'] = data_arc[i].values()
# Both key columns are cast to str before the merge.
tmp['inum'] = tmp['inum'].astype(str)
tturns['inum'] = tturns['inum'].astype(str)
tmp = pd.merge(tmp, tturns, on = 'inum', how = 'inner')
tmp = tmp.sort_values(by = ['freq', 'turn'], ascending = False)
# Keep at most the first 15 ids and flatten the array's repr into a string.
if len(tmp['inum'].values) > 14:
inums = str(tmp['inum'].values[0:15]).replace("\n", "").replace(' ', ',').replace('\'', '')
else:
inums = str(tmp['inum'].values).replace("\n", "").replace(' ', ',').replace('\'', '')
# NOTE(review): `res` is never defined here (probably `result`), and the keys
# 'inum'/'recs' differ from the columns declared for `result` — confirm.
result = res.append({'inum' : i, 'recs' : inums}, ignore_index = True)
return(result)
I tried adding a merge inside the function to bring in ITEM_NAME on every iteration, but it takes far too long: my dataset has about 10 million rows.
I need to add one more column to my output containing the list of 'ITEM_NAME' values for the items in the 'sold together' list. How can I calculate it fast?
UPD:
Here's what is needed:
item_id
list_of items
list_of_names
sum
id_01
[id, id, id, id]
[name, name....]
num
Here `list_of items` is the list of the items most commonly purchased together with 'item_id'.

This might do it:
import pandas as pd
# Sample data: one row per purchased item, several rows may share an order code.
df = pd.DataFrame(dict(
    ORDER_CODE=['123', '321', '123', '123', '321', '555'],
    ITEM_ID=[1, 2, 5, 5, 4, 6],
    ITEM_NAME=['name1', 'name2', 'name3', 'name4', 'name5', 'name6'],
    TOTALPRICE=[10, 20, 50, 50, 40, 60],
))
# Collapse every order into a single row: item ids and names are gathered
# into lists, prices are summed.
result = df.groupby("ORDER_CODE").agg(
    {
        "ITEM_ID": list,
        "ITEM_NAME": list,
        "TOTALPRICE": "sum",
    }
)
Further good answer how to create a list in a group by aggregation:

Related

Convert column containing '.' in dataframe to dictionary

I want to convert df to df1
df = pd.DataFrame({'A': [1], 'in.1': [8977], 'in.2': [8977], 'B': [
{
"C.i": 87387460,
"C.j":233
}]})
df1 = pd.DataFrame({'A': [1], 'in':{'1': [8977], '2': [8977]}, 'B': [
{"C":{
"i": 87387460,
"j":233}
}]})
I tried using recursive function but no luck.
My Code:
def convert_df(df):
    """Recursively unpack a DataFrame into plain Python values.

    - no rows: returns an empty list
    - exactly one cell: returns that cell's value
    - one column: returns a list with each cell converted recursively
    - otherwise: returns one dict per row, mapping each column name to the
      recursively converted cell
    """
    n_rows, n_cols = df.shape[0], df.shape[1]

    if n_rows == 0:
        return []
    if n_rows == 1 and n_cols == 1:
        return df.iloc[0, 0]
    if n_cols == 1:
        only_column = df[df.columns[0]].tolist()
        return [convert_df(pd.DataFrame(cell)) for cell in only_column]

    converted_rows = []
    for _, row in df.iterrows():
        converted_rows.append(
            {
                col_name: convert_df(pd.DataFrame(cell))
                for col_name, cell in row.to_dict().items()
            }
        )
    return converted_rows
To convert a column in a DataFrame that contains dots . in its name to a dictionary, you can do the following:
def convert_to_dict(df, col_name):
    """Merge the dotted-key dicts stored in one column into a single nested dict.

    Every cell of ``df[col_name]`` is expected to be a dict. A key such as
    ``"C.i"`` is split on ``.`` and stored as ``{"C": {"i": value}}``. All rows
    are folded into the same result, so a later row overwrites an earlier one
    when both carry the same key.
    """
    nested = {}
    for record in df[col_name].tolist():
        for dotted_key, value in record.items():
            *parents, leaf = dotted_key.split('.')
            # Walk down to the innermost dict, creating levels as needed.
            node = nested
            for part in parents:
                node = node.setdefault(part, {})
            node[leaf] = value
    return nested
# Nest the dotted keys found in column 'B', then wrap the resulting dict in a
# one-row frame under the same column name.
col_name = 'B'
df1 = pd.DataFrame({col_name: [convert_to_dict(df, col_name)]})
The expected output is ambiguous, but I assume you might want to use to_dict:
# Columns whose label contains 'in' (here 'in.1' and 'in.2').
cols = df.filter(like='in').columns.tolist()
# Drop those columns and replace them with a single 'in' column that holds,
# for every row, a dict of the dropped values keyed by the old column label.
df1 = df.drop(columns=cols)
df1['in'] = pd.Series(df[cols].to_dict(orient='index'))
Output:
A B in
0 1 {'C.i': 87387460, 'C.j': 233} {'in.1': 8977, 'in.2': 8977}

Python generated Excel file only shows one row of data vs multiple rows

I am trying to write the results from the loop into an Excel file (keys = column names, values = row data). This code generates the file, but it only writes one row of data to it. How can I make it append the other rows to the file?
# Looks up businesses through an HTTP API, fetches the history for each business
# id, and writes the collected fields to an Excel file.
# NOTE(review): indentation was lost in this listing, so loop nesting is inferred.
# NOTE(review): the bearer token is hard-coded in three places; credentials are
# normally read from configuration or the environment rather than the source.
import pandas as pd
p = (('BusinessName', 'CustomerNameToSearch'), ('PageSize', '2'), ('CountryCode', 'CA'))
prepare_link = requests.get('https://api.myapiloopuplink?', auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
test = requests.get(prepare_link.url, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
data = json.loads(test.text)
# Collect the business ids (as strings) returned by the search.
CustomerIdList = []
for customer in data['Data']:
BusinessID = customer['BusinessId']
BusinessName = customer['BusinessName']
CustomerIdList.append(str(customer['BusinessId']))
# One history request per collected id.
for i in CustomerIdList:
links2 = ("https://api.myapiloopuplink/"+i+"/History?count=1")
test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
data2 = json.loads(test2.text)
start_row = 0
for extradetails in data2['Data']:
myDict = {}
# NOTE(review): `customer` is the loop variable of the `for customer in
# data['Data']` loop above; if that loop has already finished here, these two
# fields always come from the last customer rather than from id `i` — confirm.
myDict["BusinessId"] = customer['BusinessId']
myDict["BusinessName"] = customer['BusinessName']
myDict["Year"] = extradetails['Year']
myDict["Rate"] = extradetails['Rate']
print(myDict)
# k and v are not used afterwards.
k = list(myDict.keys())
v = list(myDict.values())
#print(k)
#print(v)
x = [myDict]
# A one-row frame is built and written to the same file name for every record.
# NOTE(review): each to_excel call on a path creates the workbook anew, which
# would explain why only one row survives — confirm.
df = pd.DataFrame(x)
df.to_excel ('locationandnameoffile.xlsx', sheet_name = 'sheet1', index = False, startrow=start_row)
start_row = start_row + len(df) + 1
This is the output I currently get:
This is the output I am trying to get:
Inside the loop I get the right results when I print (it shows multiple rows):
print(myDict)
I think the problem is here:
for extradetails in data2['Data']:
myDict = {}
myDict["BusinessId"] = customer['BusinessId']
myDict["BusinessName"] = customer['BusinessName']
myDict["Year"] = extradetails['Year']
myDict["Rate"] = extradetails['Rate']
print(myDict)
k = list(myDict.keys())
v = list(myDict.values())
#print(k)
#print(v)
x = [myDict]
df = pd.DataFrame(x) #problem
df.to_excel ('locationandnameoffile.xlsx', sheet_name = 'sheet1', index = False, startrow=start_row)#problem
start_row = start_row + len(df) + 1
You are creating an Excel file on every iteration of the loop. How about creating the Excel file once, after the loop completes? Like this:
# Collect one plain dict per record and write the workbook once at the end.
# If this loop runs inside an outer per-customer loop, create `datas` before
# that outer loop and do the DataFrame/to_excel step after it, so that the
# records of every customer end up in the same sheet.
datas = []
for extradetails in data2['Data']:
    myDict = {
        "BusinessId": customer['BusinessId'],
        "BusinessName": customer['BusinessName'],
        "Year": extradetails['Year'],
        "Rate": extradetails['Rate'],
    }
    print(myDict)
    # Append the dict itself (not a list wrapping it), so that the keys become
    # the column names and every record becomes one row.
    datas.append(myDict)

# A single frame holding all rows, written once from the top of the sheet.
df = pd.DataFrame(datas)
df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)

Create an organized DF from a List of mixed type items (Python)

I have a list of items in a 'variable:value' format, but the same 'variable' can appear multiple times. The only thing I know is that all values that follow the 'ID' category belong to the same 'ID', so I know how many rows I need (3 in this example).
I need to create a dataframe from this list. The problem I am encountering is that I cannot add a string value to my DF ('could not convert str to float'). I am not sure how to proceed.
mylist = ['ID:1', 'Date: Oct 2', 'B:88', 'C:noun', 'D:44', 'ID:2', 'B:55', 'C:noun', 'D:45', 'ID:3',
'Date:Sept 5', 'B:55', 'C:verb']
categories = []
for i in mylist:
var = i.split(":")
categories.append(var[0])
variables = list(set(categories))
df = np.empty((3,len(variables)))
df = pd.DataFrame(df)
counter = -1
for i in mylist:
item = i.split(":")
category = item[0]
value = item[1]
tracker = -1
for j in variables:
tracker = tracker + 1
if j == category:
float(value)
df[counter, tracker] = value
if category == "ID":
counter = counter + 1
float(value)
df[counter, 0] = value
In addition, I've tried converting the items in the list to dictionary, but I am not sure if that's the best way to achieve my goal:
df = np.empty((3,len(variables)))
df = pd.DataFrame(df, columns = variables)
mydict = {}
counter = -1
for i in mylist:
item = i.split(":")
category = item[0]
value = item[1]
mydict = {category:value}
if category == "ID":
counter = counter + 1
df[counter] = pd.DataFrame.from_dict(mydict)
else:
df[counter] = pd.DataFrame.from_dict(mydict)
Edit:
I solved it. Code below:
# Group the 'variable:value' items into one record per ID: every 'ID' item
# starts a new record and the items that follow it belong to that record.
records = []
for i in mylist:
    # Split on the first ':' only, so a value that itself contains ':' stays whole.
    category, _, value = i.partition(":")
    if category == 'ID' or not records:
        records.append({})
    # Every item, including the very last one in the list, is stored in the
    # record that is currently open.
    records[-1][category] = value

# Build the frame in one step (DataFrame.append was removed in pandas 2.0);
# categories that a record does not have are left as NaN.
df = pd.DataFrame(records, columns = variables)
Perhaps this works
# Split each item on its first ':' into a (key, value) pair.
df = pd.DataFrame([e.split(':', 1) for e in mylist],
                  columns=['key', 'value'])
# Every 'ID' item starts a new record, so a running count of the 'ID' keys
# gives each item the number of the record it belongs to.
df['row'] = (df['key'] == 'ID').cumsum()
# One row per record, one column per key; keys a record lacks become NaN.
df = df.pivot(index='row', columns='key', values='value')

Handling exception with list out of range

I'm trying to extract the batting stats of all WC 2019 players, but the script stops with the error "list index out of range" at this player: http://www.espncricinfo.com/india/content/player/398438.html
How can I handle the exception (or skip that player) so that I still get the stats for the complete team?
# Downloads the batting innings page for `player_id`, turns every row of the
# second <tbody> into a dict, and appends the resulting frame to SQL table "dummy".
# NOTE(review): indentation was lost in this listing, so loop nesting is inferred.
# NOTE(review): the URL puts 'class=2' directly after '.html' with no '?' in
# between — presumably a query-string separator is missing; confirm.
url2 = 'http://stats.espncricinfo.com/ci/engine/player/' + \
str(player_id) + \
'.htmlclass=2;template=results;type=batting;view=innings'
html = urllib.request.urlopen(url2, context=ctx).read()
temp_data = OrderedDict()
list_of_dict = []
bs = BeautifulSoup(html, 'lxml')
table_body = bs.find_all('tbody')
# NOTE(review): table_body[1] raises IndexError when the page has fewer than two
# <tbody> elements — possibly the reported "list index out of range"; confirm.
rows = table_body[1].find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [x.text.strip() for x in cols]
temp_data = OrderedDict()
# The loop variable i is not used by the assignments below, which read fixed
# positions; cols[9] is never read. A row with fewer than 13 cells makes
# cols[12] raise IndexError.
for i in range(len(cols)):
temp_data["Runs"] = cols[0]
temp_data["Mins"] = cols[1]
temp_data["BF"] = cols[2]
temp_data["fours"] = cols[3]
temp_data["sixs"] = cols[4]
temp_data["SR"] = cols[5]
temp_data["POS"] = cols[6]
temp_data["Dismissal"] = cols[7]
temp_data["Inns"] = cols[8]
temp_data["Opposition"] = cols[10]
temp_data["Ground"] = cols[11]
temp_data["Date"] = cols[12]
temp_data["player"] = player
temp_data["playerid"] = player_id
list_of_dict.append(temp_data)
df = pd.DataFrame(list_of_dict)
df
# Appends to the existing table instead of replacing it.
df.to_sql("dummy", con, if_exists="append")
I'd like to extract all WC squad wise player stats.

Script in python/pandas works but doesn't work when placed in side a function

I have this script I'm running to try to create a dataframe to summarize some statistics:
month = [may,june,july,august,sept]
month_str = [5,6,7,8,9]
avg_age = []
avg_use = []
avg_kwh = []
avg_coll = []
avg_cred = []
for i in month:
avg_age.append(i[i['Age']!=0]['Age'].mean())
avg_use.append(i[i['AverageBilledUsage']!=0]['AverageBilledUsage'].mean())
avg_kwh.append(i[i['AverageKWH']!=0]['AverageKWH'].mean())
avg_coll.append(i[i['Total Collected']!=0]['Total Collected'].mean())
avg_cred.append(i[(i['credit_score']!=0) & (i['credit_score']!=99999)]['credit_score'].mean())
pd.DataFrame(data = [avg_age,avg_use,avg_kwh,avg_coll,avg_cred],columns = month_str,index = ['Age','Usage','kwh','collected','creditscore'])
It returns exactly what I want to see. But when I place it inside a function I get the following error:
AssertionError: 5 columns passed, passed data had 1 columns
Here is the code inside the function:
def get_nums():
months = [may,june,july,august,sept]
month_str = [5,6,7,8,9]
avg_age = []
avg_use = []
avg_kwh = []
avg_coll = []
avg_cred = []
for i in months:
avg_age.append(i[i['Age']!=0]['Age'].mean())
avg_use.append(i[i['AverageBilledUsage']!=0]['AverageBilledUsage'].mean())
avg_kwh.append(i[i['AverageKWH']!=0]['AverageKWH'].mean())
avg_coll.append(i[i['Total Collected']!=0]['Total Collected'].mean())
avg_cred.append(i[(i['credit_score']!=0) & (i['credit_score']!=99999)]['credit_score'].mean())
this_df = pd.DataFrame(data = [avg_age,avg_use,avg_kwh,avg_coll,avg_cred],columns = month_str,index = ['Age','Usage','kwh','collected','creditscore'])
return this_df
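You have a problem with the last line of the for loop in the function: this_df is being defined in every iteration of the loop, when only the first month's averages have been collected. Build it once, after the loop has finished.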
The corrected code is below.
def get_nums():
    """Build a summary table of monthly averages.

    Reads the monthly DataFrames ``may``, ``june``, ``july``, ``august`` and
    ``sept`` from the enclosing scope. For each month it averages 'Age',
    'AverageBilledUsage', 'AverageKWH' and 'Total Collected' over the rows
    where that value is non-zero, and 'credit_score' over the rows where it
    is neither 0 nor 99999.

    Returns:
        A DataFrame with one row per statistic ('Age', 'Usage', 'kwh',
        'collected', 'creditscore') and one column per month (5 to 9).
    """
    months = [may, june, july, august, sept]
    month_str = [5, 6, 7, 8, 9]
    avg_age = []
    avg_use = []
    avg_kwh = []
    avg_coll = []
    avg_cred = []
    for i in months:
        avg_age.append(i[i['Age'] != 0]['Age'].mean())
        avg_use.append(i[i['AverageBilledUsage'] != 0]['AverageBilledUsage'].mean())
        avg_kwh.append(i[i['AverageKWH'] != 0]['AverageKWH'].mean())
        avg_coll.append(i[i['Total Collected'] != 0]['Total Collected'].mean())
        avg_cred.append(i[(i['credit_score'] != 0) & (i['credit_score'] != 99999)]['credit_score'].mean())
    # Build the frame only once, after the loop: inside the loop each list
    # holds a single value on the first pass, which cannot fill five columns.
    this_df = pd.DataFrame(
        data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred],
        columns=month_str,
        index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'],
    )
    return this_df
Based on my understanding, you do not need the for loop here:
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]
# Stack the monthly frames; the month number becomes the outer index level.
df = pd.concat(month, keys=month_str)
# Blank out the sentinel values 0 and 99999 so that mean() skips them.
# Each comparison needs its own parentheses: `|` binds tighter than `==`, so
# the unparenthesised form is read as a chained comparison against `0 | df`.
df = df.mask((df == 0) | (df == 99999))
# Per-month means, transposed so that months are the columns.
df.groupby(level=0).mean().T

Categories

Resources