I want to convert df to df1
import pandas as pd

df = pd.DataFrame({'A': [1], 'in.1': [8977], 'in.2': [8977], 'B': [
    {
        "C.i": 87387460,
        "C.j": 233
    }]})
df1 = pd.DataFrame({'A': [1], 'in': {'1': [8977], '2': [8977]}, 'B': [
    {"C": {
        "i": 87387460,
        "j": 233}
    }]})
I tried using a recursive function, but no luck. My code:
def convert_df(df):
    if df.shape[0] == 0:
        return []
    elif df.shape[0] == 1 and df.shape[1] == 1:
        return df.iloc[0, 0]
    elif df.shape[1] == 1:
        return [convert_df(pd.DataFrame(val)) for val in df[df.columns[0]].tolist()]
    else:
        return [{col_name: convert_df(pd.DataFrame(val))
                 for col_name, val in row.to_dict().items()}
                for i, row in df.iterrows()]
To convert a column whose values are dictionaries with dotted keys into nested dictionaries, you can do the following:
def convert_to_dict(df, col_name):
    out = {}
    for row in df[col_name].tolist():
        for key, val in row.items():
            sub_keys = key.split('.')
            d = out
            for sub_key in sub_keys[:-1]:
                if sub_key not in d:
                    d[sub_key] = {}
                d = d[sub_key]
            d[sub_keys[-1]] = val
    return out
col_name = 'B'
df1 = convert_to_dict(df, col_name)
df1 = pd.DataFrame({col_name: [df1]})
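Applied to the example df above, the nesting can be sanity-checked directly (a quick check using only the sample data from the question):

print(convert_to_dict(df, 'B'))
# {'C': {'i': 87387460, 'j': 233}}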
The expected output is ambiguous, but I assume you might want to use to_dict:
cols = list(df.filter(like='in'))
df1 = (df.drop(columns=cols)
         .assign(**{'in': pd.Series(df[cols].to_dict('index'))})
      )
Output:
   A                              B                            in
0  1  {'C.i': 87387460, 'C.j': 233}  {'in.1': 8977, 'in.2': 8977}
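If you also want the keys inside 'in' without the in. prefix, matching the df1 shape in the question, a small hedged refinement (it assumes every matched column name starts with 'in.'):

renamed = df[cols].rename(columns=lambda c: c.split('.', 1)[1])
df1 = (df.drop(columns=cols)
         .assign(**{'in': pd.Series(renamed.to_dict('index'))}))

This would put {'1': 8977, '2': 8977} in the 'in' column instead.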
I have a dataset like this:

ORDER_CODE  ITEM_ID  ITEM_NAME  TOTALPRICE
123         id1      name1      345
321         id2      name2      678
and a function that calculates which items were sold together, and which of those were the most popular or most expensive.
Out:

ITEM_ID  sold together
id1      [id33, id23, id12]
id2      [id56, id663]
I am using this function:
def freq(df):
    hit_list = [list of ID's]
    result = pd.DataFrame(columns=['ITEM_ID', 'sold together'])
    unic_arc = df['ITEM_ID'].unique()
    unic_num = df['ORDER_CODE'].unique()
    data_arc = {}
    data_num = {}
    for i in unic_arc:
        data_arc[i] = {}
    tturns = response_ur[['ITEM_ID', 'TOTALPRICE']].groupby(by='ITEM_ID', as_index=False).sum()
    tturns = tturns.rename(columns={'ITEM_ID': 'inum', 'TOTALPRICE': 'turn'})
    for i in tqdm(unic_arc):
        b = df[df['ITEM_ID'] == i]['ORDER_CODE'].values
        for t in b:
            a = df[df['ORDER_CODE'] == t]['ID'].values
            if i in a:
                for arc in a:
                    if int(arc) not in hit_list:
                        if arc != i:
                            if arc in data_arc[i]:
                                data_arc[i][arc] += 1
                            else:
                                data_arc[i][arc] = 1
        dd = data_arc[i]
        tmp = pd.DataFrame(columns=['inum', 'freq'])
        tmp['inum'] = data_arc[i].keys()
        tmp['freq'] = data_arc[i].values()
        tmp['inum'] = tmp['inum'].astype(str)
        tturns['inum'] = tturns['inum'].astype(str)
        tmp = pd.merge(tmp, tturns, on='inum', how='inner')
        tmp = tmp.sort_values(by=['freq', 'turn'], ascending=False)
        if len(tmp['inum'].values) > 14:
            inums = str(tmp['inum'].values[0:15]).replace("\n", "").replace(' ', ',').replace('\'', '')
        else:
            inums = str(tmp['inum'].values).replace("\n", "").replace(' ', ',').replace('\'', '')
        result = result.append({'inum': i, 'recs': inums}, ignore_index=True)
    return result
I tried adding a merge to the function to attach ITEM_NAME on each iteration, but it is far too slow; my dataset has about 10 million rows. I need to add one more column to my output containing the list of ITEM_NAME values for the items in the 'sold together' list, and it has to be computed fast.
UPD:

Here is what is needed:

item_id  list_of_items     list_of_names     sum
id_01    [id, id, id, id]  [name, name, ...]  num

where list_of_items is the list of the most common items purchased together with 'item_id'.
This might do it:
import pandas as pd

df = pd.DataFrame({
    'ORDER_CODE': ['123', '321', '123', '123', '321', '555'],
    'ITEM_ID': [1, 2, 5, 5, 4, 6],
    'ITEM_NAME': ['name1', 'name2', 'name3', 'name4', 'name5', 'name6'],
    'TOTALPRICE': [10, 20, 50, 50, 40, 60]})

result = df.groupby("ORDER_CODE").agg({"ITEM_ID": list, "ITEM_NAME": list, "TOTALPRICE": "sum"})
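For the sample frame above, result should come out like this (derived from the sample data; whitespace approximate):

print(result)
#                ITEM_ID              ITEM_NAME  TOTALPRICE
# ORDER_CODE
# 123         [1, 5, 5]  [name1, name3, name4]         110
# 321            [2, 4]         [name2, name5]          60
# 555               [6]                [name6]          60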
There is a further good answer on how to create a list in a groupby aggregation.
I am looking for a way to convert this structure:

for ... in dataframe:
    while ...:
        if ...:
            do smth
        if ...:
            do smth

to

dataframe.apply(lambda ...: ...)
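As a generic illustration of the pattern (a toy example, not the fuzzy-matching case below): a row-wise loop with branching can often be rewritten as a single apply call.

import pandas as pd

toy = pd.DataFrame({'x': [1, -2, 3]})

# loop version
labels = []
for _, row in toy.iterrows():
    labels.append('pos' if row['x'] > 0 else 'neg')

# apply version: the same branching in one expression
toy['label'] = toy.apply(lambda row: 'pos' if row['x'] > 0 else 'neg', axis=1)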
Here is an example of a function with a for/while loop:
import pandas as pd
from rapidfuzz import fuzz

d_test = {
    'name': ['South Beach', 'Dog', 'Bird', 'Ant', 'Big Dog', 'Beach', 'Dear', 'Cat', 'Fish', 'Dry Fish'],
    'cluster_number': [1, 2, 3, 3, 2, 1, 4, 2, 2, 2]
}
df_test = pd.DataFrame(d_test)
df_test = df_test.sort_values(['cluster_number', 'name'])
df_test.reset_index(drop=True, inplace=True)
df_test['id'] = 0

def loop_in_cluster(index, row, df_test, index_, row_, is_i_used, i):
    while (index_ < len(df_test)
           and df_test.loc[index, 'cluster_number'] == df_test.loc[index_, 'cluster_number']
           and df_test.loc[index_, 'id'] == 0):
        if row['name'] == df_test.loc[index_, 'name'] or fuzz.ratio(row['name'], df_test.loc[index_, 'name']) > 50:
            df_test.loc[index_, 'id'] = i
            is_i_used = True
        index_ += 1
    return df_test, is_i_used
i = 1
is_i_used = False
for index, row in df_test.iterrows():
    row_ = row
    index_ = index
    df_test, is_i_used = loop_in_cluster(index, row, df_test, index_, row_, is_i_used, i)
    if is_i_used == True:
        i += 1
        is_i_used = False
And here is my attempt to use the dataframe.apply() method:

i = 1
df_test.apply(lambda row: loop_in_cluster(i=i + 1, index=row.name, row=row, df_test=df_test,
                                          index_=index, row_=row, is_i_used=False)
              if is_i_used == True
              else loop_in_cluster(i=i, index=row.name, row=row, df_test=df_test,
                                   index_=index, row_=row, is_i_used=True), axis=1)

but I am getting a StopIteration error in the output. I tried other methods, such as pandas GroupBy, but the apply method seems closer to what I am looking for.
If first_name or last_name is null in the input data, then remove those records from df and put them into a new error dataframe with an extra column:
"rejection_reason": "['first_name', 'last_name'] is empty".
Input Data:
customer_number|first_name|middle_name|last_name|gender
90617174||Aliari||Male
92154246|Roberto||Intriago Nunez|Male
07605348|E|A|Christodoulou|Male
80284242|Ritchie|O||Male
Error File :
customer_number|first_name|middle_name|last_name|gender|rejection_reason
90617174||Aliari||Male|["first_name","last_name"] is empty
80284242|Ritchie|O||Male|["last_name"] is empty
Output File:
customer_number|first_name|middle_name|last_name|gender
92154246|Roberto||Intriago Nunez|Male
07605348|E|A|Christodoulou|Male
Code Tried:
newList = ['first_name', 'last_name']
for index, row in df.iterrows():
    error_col = []
    temp_dic = []
    for col in newList:
        if row[col] == '' or pd.isna(row[col]) or pd.isnull(row[col]):
            error_col.append(col)
            row["rejection_reason"] = col + ' is empty'
            df.drop(index, inplace=True)
            temp_dic.append(row)
print("temp dic:", temp_dic)
Error
raise KeyError(f"{labels[mask]} not found in axis")
KeyError: '[0] not found in axis'
Since you drop the row each time you see an empty column in that row, you may remove the same row more than once. After removing it the first time, the second drop raises the KeyError. This should work:
newList = ['first_name', 'last_name']
temp_dic = []
for index, row in df.iterrows():
    error_col = []
    for col in newList:
        if row[col] == '' or pd.isna(row[col]) or pd.isnull(row[col]):
            error_col.append(col)
    if len(error_col) > 0:
        df.drop(index, inplace=True)
        temp_dic.append(row)
        row["rejection_reason"] = str(error_col) + ' is empty'
print("temp dic:", temp_dic)
But I recommend doing this instead of processing each row:
newList = ['first_name', 'last_name']

def check_columns(row):
    error_col = []
    for col in newList:
        if row[col] == '' or pd.isna(row[col]) or pd.isnull(row[col]):
            error_col.append(col)
    if len(error_col) > 0:
        return str(error_col) + ' is empty'
    else:
        return ''

df['rejection_reason'] = df.apply(check_columns, axis=1)
df_error = df[df['rejection_reason'] != '']
df_output = df[df['rejection_reason'] == '']
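On the sample input above (assuming it was read with something like pd.read_csv(..., sep='|', dtype=str), so empty fields come in as NaN), the split behaves as expected:

print(df_error['rejection_reason'].tolist())
# ["['first_name', 'last_name'] is empty", "['last_name'] is empty"]
print(df_output['customer_number'].tolist())
# ['92154246', '07605348']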
Given the input dataframe df, to filter out the rows where 'last_name' or 'first_name' is null, the following will work:
filter_df = df[~(df['first_name'].isnull() | df['last_name'].isnull())]
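An equivalent shorthand, assuming the missing values really are NaN rather than empty strings:

filter_df = df.dropna(subset=['first_name', 'last_name'])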
To create the error dataframe containing the rows where 'last_name' or 'first_name' is null, along with the corresponding error, the following code will work:
error_df = df[(df['first_name'].isnull() | df['last_name'].isnull())].copy()
error_df.loc[error_df['first_name'].isnull(), "rejection_reason"] = "['first_name'] is empty."
error_df.loc[error_df['last_name'].isnull(), "rejection_reason"] = "['last_name'] is empty."
error_df.loc[(error_df['first_name'].isnull() & error_df['last_name'].isnull()), "rejection_reason"] = "['first_name', 'last_name'] is empty."
Output of filter_df given the above input:

  customer_number first_name middle_name       last_name gender
1        92154246    Roberto         NaN  Intriago Nunez   Male
2        07605348          E           A   Christodoulou   Male

Output of error_df given the above input:

  customer_number first_name middle_name last_name gender                        rejection_reason
0        90617174        NaN      Aliari       NaN   Male  ['first_name', 'last_name'] is empty.
3        80284242    Ritchie           O       NaN   Male               ['last_name'] is empty.
I am trying to write the results from the loop into an Excel file (keys = column names, values = row data). This code generates the file for me, but it only writes one row of data. How can I make it append the other rows to the file?
import json
import requests
import pandas as pd

p = (('BusinessName', 'CustomerNameToSearch'), ('PageSize', '2'), ('CountryCode', 'CA'))
prepare_link = requests.get('https://api.myapiloopuplink?', auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
test = requests.get(prepare_link.url, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
data = json.loads(test.text)

CustomerIdList = []
for customer in data['Data']:
    BusinessID = customer['BusinessId']
    BusinessName = customer['BusinessName']
    CustomerIdList.append(str(customer['BusinessId']))

for i in CustomerIdList:
    links2 = ("https://api.myapiloopuplink/" + i + "/History?count=1")
    test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
    data2 = json.loads(test2.text)
    start_row = 0
    for extradetails in data2['Data']:
        myDict = {}
        myDict["BusinessId"] = customer['BusinessId']
        myDict["BusinessName"] = customer['BusinessName']
        myDict["Year"] = extradetails['Year']
        myDict["Rate"] = extradetails['Rate']
        print(myDict)
        k = list(myDict.keys())
        v = list(myDict.values())
        #print(k)
        #print(v)
        x = [myDict]
        df = pd.DataFrame(x)
        df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False, startrow=start_row)
        start_row = start_row + len(df) + 1
This is the output I currently get:
This is the output I am trying to get:
In the loop, I get the right results when I print (it shows multiple rows):
print(myDict)
I think the problem is here:
for extradetails in data2['Data']:
    myDict = {}
    myDict["BusinessId"] = customer['BusinessId']
    myDict["BusinessName"] = customer['BusinessName']
    myDict["Year"] = extradetails['Year']
    myDict["Rate"] = extradetails['Rate']
    print(myDict)
    k = list(myDict.keys())
    v = list(myDict.values())
    #print(k)
    #print(v)
    x = [myDict]
    df = pd.DataFrame(x)  # problem
    df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False, startrow=start_row)  # problem
    start_row = start_row + len(df) + 1
You are recreating the Excel file on every iteration, so each write overwrites the previous one. How about creating the file once, after the loop completes, like this:
datas = []
for extradetails in data2['Data']:
    myDict = {}
    myDict["BusinessId"] = customer['BusinessId']
    myDict["BusinessName"] = customer['BusinessName']
    myDict["Year"] = extradetails['Year']
    myDict["Rate"] = extradetails['Rate']
    print(myDict)
    datas.append(myDict)  # collect rows; build the frame once at the end

df = pd.DataFrame(datas)
df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)
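Note the outer loop over CustomerIdList has the same overwrite problem, so a hedged sketch of the whole flow would collect rows across both loops and write once at the very end (BearerAuth, the token, and the endpoint are copied from the question; BusinessName is omitted for brevity, as you would carry it along from the first loop):

rows = []
for i in CustomerIdList:
    links2 = "https://api.myapiloopuplink/" + i + "/History?count=1"
    test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
    data2 = json.loads(test2.text)
    for extradetails in data2['Data']:
        rows.append({
            "BusinessId": i,
            "Year": extradetails['Year'],
            "Rate": extradetails['Rate'],
        })

pd.DataFrame(rows).to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)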
I have a list of items in a 'variable:value' format, but the same 'variable' can appear multiple times. The only thing I know is that all values that follow the 'ID' category belong to the same 'ID', so I know how many rows I need (3 in this example).
I need to create a dataframe from this list. The problem I am encountering is that I cannot add a string value to my DF ('could not convert str to float'). I am not sure how to proceed.
import numpy as np
import pandas as pd

mylist = ['ID:1', 'Date: Oct 2', 'B:88', 'C:noun', 'D:44', 'ID:2', 'B:55', 'C:noun', 'D:45', 'ID:3',
          'Date:Sept 5', 'B:55', 'C:verb']

categories = []
for i in mylist:
    var = i.split(":")
    categories.append(var[0])
variables = list(set(categories))

df = np.empty((3, len(variables)))
df = pd.DataFrame(df)

counter = -1
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    tracker = -1
    for j in variables:
        tracker = tracker + 1
        if j == category:
            float(value)
            df[counter, tracker] = value
    if category == "ID":
        counter = counter + 1
        float(value)
        df[counter, 0] = value
In addition, I've tried converting the items in the list to a dictionary, but I am not sure if that's the best way to achieve my goal:
df = np.empty((3, len(variables)))
df = pd.DataFrame(df, columns=variables)

mydict = {}
counter = -1
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    mydict = {category: value}
    if category == "ID":
        counter = counter + 1
        df[counter] = pd.DataFrame.from_dict(mydict)
    else:
        df[counter] = pd.DataFrame.from_dict(mydict)
Edit:
I solved it. Code below:
df = np.empty((0, len(variables)))
df = pd.DataFrame(df, columns=variables)

mydict = {}
counter = 0
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    mynewdef = {category: value}
    counter = counter + 1
    if counter == len(mylist):
        mydict.update(mynewdef)  # include the final item before the last append
        df = df.append(mydict, ignore_index=True)
        df = df.iloc[1:]
    elif category == 'ID':
        df = df.append(mydict, ignore_index=True)
        mydict = {}
        mydict.update(mynewdef)
    else:
        mydict.update(mynewdef)
Perhaps this works:

df = pd.DataFrame([e.split(':') for e in mylist],
                  columns=['key', 'value'])
df = df.pivot(columns='key', values='value')  # not tested
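Since the pivot above is untested and spreads each record over several rows, here is a hedged alternative sketch: start a new record at each 'ID:' entry, then build the frame from the list of dicts (this assumes 'ID' always begins a record, as in mylist above).

import pandas as pd

mylist = ['ID:1', 'Date: Oct 2', 'B:88', 'C:noun', 'D:44', 'ID:2', 'B:55', 'C:noun', 'D:45', 'ID:3',
          'Date:Sept 5', 'B:55', 'C:verb']

records = []
for entry in mylist:
    key, _, value = entry.partition(':')
    if key == 'ID':
        records.append({})          # 'ID' starts a new row
    records[-1][key.strip()] = value.strip()

df = pd.DataFrame(records)
#   ID    Date   B     C    D
# 0  1   Oct 2  88  noun   44
# 1  2     NaN  55  noun   45
# 2  3  Sept 5  55  verb  NaN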