I want to convert df to df1
import pandas as pd

df = pd.DataFrame({'A': [1], 'in.1': [8977], 'in.2': [8977], 'B': [
    {
        "C.i": 87387460,
        "C.j": 233
    }]})
df1 = pd.DataFrame({'A': [1], 'in': {'1': [8977], '2': [8977]}, 'B': [
    {"C": {
        "i": 87387460,
        "j": 233}
    }]})
I tried using a recursive function, but no luck. My code:
def convert_df(df):
    if df.shape[0] == 0:
        return []
    elif df.shape[0] == 1 and df.shape[1] == 1:
        return df.iloc[0, 0]
    elif df.shape[1] == 1:
        return [convert_df(pd.DataFrame(val)) for val in df[df.columns[0]].tolist()]
    else:
        return [{col_name: convert_df(pd.DataFrame(val))
                 for col_name, val in row.to_dict().items()}
                for i, row in df.iterrows()]
To convert a column whose values are dictionaries with dotted keys into nested dictionaries, you can do the following:
def convert_to_dict(df, col_name):
    out = {}
    for row in df[col_name].tolist():
        for key, val in row.items():
            sub_keys = key.split('.')
            d = out
            for sub_key in sub_keys[:-1]:
                if sub_key not in d:
                    d[sub_key] = {}
                d = d[sub_key]
            d[sub_keys[-1]] = val
    return out
col_name = 'B'
df1 = convert_to_dict(df, col_name)
df1 = pd.DataFrame({col_name: [df1]})
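Applied to the example df above, the nesting can be sanity-checked directly (a quick check using only the sample data from the question):

print(convert_to_dict(df, 'B'))
# {'C': {'i': 87387460, 'j': 233}}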
The expected output is ambiguous, but I assume you might want to use to_dict:
cols = list(df.filter(like='in'))
df1 = (df.drop(columns=cols)
         .assign(**{'in': pd.Series(df[cols].to_dict('index'))})
      )
Output:
   A                              B                            in
0  1  {'C.i': 87387460, 'C.j': 233}  {'in.1': 8977, 'in.2': 8977}
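If you also want the keys inside 'in' without the in. prefix, matching the df1 shape in the question, a small hedged refinement (it assumes every matched column name starts with 'in.'):

renamed = df[cols].rename(columns=lambda c: c.split('.', 1)[1])
df1 = (df.drop(columns=cols)
         .assign(**{'in': pd.Series(renamed.to_dict('index'))}))

This would put {'1': 8977, '2': 8977} in the 'in' column instead.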
I have a dataset like this:

ORDER_CODE  ITEM_ID  ITEM_NAME  TOTALPRICE
123         id1      name1      345
321         id2      name2      678
and a function that calculates which items were sold together, and which of those were the most popular or most expensive.
Out:

ITEM_ID  sold together
id1      [id33, id23, id12]
id2      [id56, id663]
I am using this function:
def freq(df):
    hit_list = [list of ID's]
    result = pd.DataFrame(columns=['ITEM_ID', 'sold together'])
    unic_arc = df['ITEM_ID'].unique()
    unic_num = df['ORDER_CODE'].unique()
    data_arc = {}
    data_num = {}
    for i in unic_arc:
        data_arc[i] = {}
    tturns = response_ur[['ITEM_ID', 'TOTALPRICE']].groupby(by='ITEM_ID', as_index=False).sum()
    tturns = tturns.rename(columns={'ITEM_ID': 'inum', 'TOTALPRICE': 'turn'})
    for i in tqdm(unic_arc):
        b = df[df['ITEM_ID'] == i]['ORDER_CODE'].values
        for t in b:
            a = df[df['ORDER_CODE'] == t]['ID'].values
            if i in a:
                for arc in a:
                    if int(arc) not in hit_list:
                        if arc != i:
                            if arc in data_arc[i]:
                                data_arc[i][arc] += 1
                            else:
                                data_arc[i][arc] = 1
        dd = data_arc[i]
        tmp = pd.DataFrame(columns=['inum', 'freq'])
        tmp['inum'] = data_arc[i].keys()
        tmp['freq'] = data_arc[i].values()
        tmp['inum'] = tmp['inum'].astype(str)
        tturns['inum'] = tturns['inum'].astype(str)
        tmp = pd.merge(tmp, tturns, on='inum', how='inner')
        tmp = tmp.sort_values(by=['freq', 'turn'], ascending=False)
        if len(tmp['inum'].values) > 14:
            inums = str(tmp['inum'].values[0:15]).replace("\n", "").replace(' ', ',').replace('\'', '')
        else:
            inums = str(tmp['inum'].values).replace("\n", "").replace(' ', ',').replace('\'', '')
        result = result.append({'inum': i, 'recs': inums}, ignore_index=True)
    return result
I tried adding a merge to the function to attach ITEM_NAME on each iteration, but it is far too slow; my dataset has about 10 million rows. I need to add one more column to my output containing the list of ITEM_NAME values for the items in the 'sold together' list, and it has to be computed fast.
UPD:

Here is what is needed:

item_id  list_of_items     list_of_names     sum
id_01    [id, id, id, id]  [name, name, ...]  num

where list_of_items is the list of the most common items purchased together with 'item_id'.
This might do it:
import pandas as pd

df = pd.DataFrame({
    'ORDER_CODE': ['123', '321', '123', '123', '321', '555'],
    'ITEM_ID': [1, 2, 5, 5, 4, 6],
    'ITEM_NAME': ['name1', 'name2', 'name3', 'name4', 'name5', 'name6'],
    'TOTALPRICE': [10, 20, 50, 50, 40, 60]})

result = df.groupby("ORDER_CODE").agg({"ITEM_ID": list, "ITEM_NAME": list, "TOTALPRICE": "sum"})
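For the sample frame above, result should come out like this (derived from the sample data; whitespace approximate):

print(result)
#                ITEM_ID              ITEM_NAME  TOTALPRICE
# ORDER_CODE
# 123         [1, 5, 5]  [name1, name3, name4]         110
# 321            [2, 4]         [name2, name5]          60
# 555               [6]                [name6]          60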
There is a further good answer on how to create a list in a groupby aggregation.
I am looking for a way to convert this structure:

for ... in dataframe:
    while ...:
        if ...:
            do smth
        if ...:
            do smth

to

dataframe.apply(lambda ...: ...)
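As a generic illustration of the pattern (a toy example, not the fuzzy-matching case below): a row-wise loop with branching can often be rewritten as a single apply call.

import pandas as pd

toy = pd.DataFrame({'x': [1, -2, 3]})

# loop version
labels = []
for _, row in toy.iterrows():
    labels.append('pos' if row['x'] > 0 else 'neg')

# apply version: the same branching in one expression
toy['label'] = toy.apply(lambda row: 'pos' if row['x'] > 0 else 'neg', axis=1)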
Here is an example of a function with a for/while loop:
import pandas as pd
from rapidfuzz import fuzz

d_test = {
    'name': ['South Beach', 'Dog', 'Bird', 'Ant', 'Big Dog', 'Beach', 'Dear', 'Cat', 'Fish', 'Dry Fish'],
    'cluster_number': [1, 2, 3, 3, 2, 1, 4, 2, 2, 2]
}
df_test = pd.DataFrame(d_test)
df_test = df_test.sort_values(['cluster_number', 'name'])
df_test.reset_index(drop=True, inplace=True)
df_test['id'] = 0

def loop_in_cluster(index, row, df_test, index_, row_, is_i_used, i):
    while (index_ < len(df_test)
           and df_test.loc[index, 'cluster_number'] == df_test.loc[index_, 'cluster_number']
           and df_test.loc[index_, 'id'] == 0):
        if row['name'] == df_test.loc[index_, 'name'] or fuzz.ratio(row['name'], df_test.loc[index_, 'name']) > 50:
            df_test.loc[index_, 'id'] = i
            is_i_used = True
        index_ += 1
    return df_test, is_i_used
i = 1
is_i_used = False
for index, row in df_test.iterrows():
    row_ = row
    index_ = index
    df_test, is_i_used = loop_in_cluster(index, row, df_test, index_, row_, is_i_used, i)
    if is_i_used == True:
        i += 1
        is_i_used = False
And here is my attempt to use the dataframe.apply() method:

i = 1
df_test.apply(lambda row: loop_in_cluster(i=i + 1, index=row.name, row=row, df_test=df_test,
                                          index_=index, row_=row, is_i_used=False)
              if is_i_used == True
              else loop_in_cluster(i=i, index=row.name, row=row, df_test=df_test,
                                   index_=index, row_=row, is_i_used=True), axis=1)

but I am getting a StopIteration error in the output. I tried other methods, such as pandas GroupBy, but the apply method seems closer to what I am looking for.
If first_name or last_name is null in the input data, then remove those records from df and put them into a new error dataframe with an extra column:
"rejection_reason": "['first_name', 'last_name'] is empty".
Input Data:
customer_number|first_name|middle_name|last_name|gender
90617174||Aliari||Male
92154246|Roberto||Intriago Nunez|Male
07605348|E|A|Christodoulou|Male
80284242|Ritchie|O||Male
Error File :
customer_number|first_name|middle_name|last_name|gender|rejection_reason
90617174||Aliari||Male|["first_name","last_name"] is empty
80284242|Ritchie|O||Male|["last_name"] is empty
Output File:
customer_number|first_name|middle_name|last_name|gender
92154246|Roberto||Intriago Nunez|Male
07605348|E|A|Christodoulou|Male
Code Tried:
newList = ['first_name', 'last_name']
for index, row in df.iterrows():
    error_col = []
    temp_dic = []
    for col in newList:
        if row[col] == '' or pd.isna(row[col]) or pd.isnull(row[col]):
            error_col.append(col)
            row["rejection_reason"] = col + ' is empty'
            df.drop(index, inplace=True)
            temp_dic.append(row)
print("temp dic:", temp_dic)
Error
raise KeyError(f"{labels[mask]} not found in axis")
KeyError: '[0] not found in axis'
Since you drop the row each time you see an empty column in that row, you may remove the same row more than once. After removing it the first time, the second drop raises the KeyError. This should work:
newList = ['first_name', 'last_name']
temp_dic = []
for index, row in df.iterrows():
    error_col = []
    for col in newList:
        if row[col] == '' or pd.isna(row[col]) or pd.isnull(row[col]):
            error_col.append(col)
    if len(error_col) > 0:
        df.drop(index, inplace=True)
        temp_dic.append(row)
        row["rejection_reason"] = str(error_col) + ' is empty'
print("temp dic:", temp_dic)
But I recommend doing this instead of processing each row:
newList = ['first_name', 'last_name']

def check_columns(row):
    error_col = []
    for col in newList:
        if row[col] == '' or pd.isna(row[col]) or pd.isnull(row[col]):
            error_col.append(col)
    if len(error_col) > 0:
        return str(error_col) + ' is empty'
    else:
        return ''

df['rejection_reason'] = df.apply(check_columns, axis=1)
df_error = df[df['rejection_reason'] != '']
df_output = df[df['rejection_reason'] == '']
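On the sample input above (assuming it was read with something like pd.read_csv(..., sep='|', dtype=str), so empty fields come in as NaN), the split behaves as expected:

print(df_error['rejection_reason'].tolist())
# ["['first_name', 'last_name'] is empty", "['last_name'] is empty"]
print(df_output['customer_number'].tolist())
# ['92154246', '07605348']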
Given the input dataframe df, to filter out the rows where 'last_name' or 'first_name' is null, the following will work:
filter_df = df[~(df['first_name'].isnull() | df['last_name'].isnull())]
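An equivalent shorthand, assuming the missing values really are NaN rather than empty strings:

filter_df = df.dropna(subset=['first_name', 'last_name'])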
To create the error dataframe containing the rows where 'last_name' or 'first_name' is null, along with the corresponding error, the following code will work:
error_df = df[(df['first_name'].isnull() | df['last_name'].isnull())].copy()
error_df.loc[error_df['first_name'].isnull(), "rejection_reason"] = "['first_name'] is empty."
error_df.loc[error_df['last_name'].isnull(), "rejection_reason"] = "['last_name'] is empty."
error_df.loc[(error_df['first_name'].isnull() & error_df['last_name'].isnull()), "rejection_reason"] = "['first_name', 'last_name'] is empty."
Output of filter_df given the above input:

  customer_number first_name middle_name       last_name gender
1        92154246    Roberto         NaN  Intriago Nunez   Male
2        07605348          E           A   Christodoulou   Male

Output of error_df given the above input:

  customer_number first_name middle_name last_name gender                        rejection_reason
0        90617174        NaN      Aliari       NaN   Male  ['first_name', 'last_name'] is empty.
3        80284242    Ritchie           O       NaN   Male               ['last_name'] is empty.
I am trying to write the results from the loop into an Excel file (keys = column names, values = row data). This code generates the file for me, but it only writes one row of data. How can I make it append the other rows to the file?
import json
import requests
import pandas as pd

p = (('BusinessName', 'CustomerNameToSearch'), ('PageSize', '2'), ('CountryCode', 'CA'))
prepare_link = requests.get('https://api.myapiloopuplink?', auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
test = requests.get(prepare_link.url, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'), params=p)
data = json.loads(test.text)

CustomerIdList = []
for customer in data['Data']:
    BusinessID = customer['BusinessId']
    BusinessName = customer['BusinessName']
    CustomerIdList.append(str(customer['BusinessId']))

for i in CustomerIdList:
    links2 = ("https://api.myapiloopuplink/" + i + "/History?count=1")
    test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
    data2 = json.loads(test2.text)
    start_row = 0
    for extradetails in data2['Data']:
        myDict = {}
        myDict["BusinessId"] = customer['BusinessId']
        myDict["BusinessName"] = customer['BusinessName']
        myDict["Year"] = extradetails['Year']
        myDict["Rate"] = extradetails['Rate']
        print(myDict)
        k = list(myDict.keys())
        v = list(myDict.values())
        #print(k)
        #print(v)
        x = [myDict]
        df = pd.DataFrame(x)
        df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False, startrow=start_row)
        start_row = start_row + len(df) + 1
This is the output I currently get:
This is the output I am trying to get:
In the loop, I get the right results when I print (it shows multiple rows):
print(myDict)
I think the problem is here:
for extradetails in data2['Data']:
    myDict = {}
    myDict["BusinessId"] = customer['BusinessId']
    myDict["BusinessName"] = customer['BusinessName']
    myDict["Year"] = extradetails['Year']
    myDict["Rate"] = extradetails['Rate']
    print(myDict)
    k = list(myDict.keys())
    v = list(myDict.values())
    #print(k)
    #print(v)
    x = [myDict]
    df = pd.DataFrame(x)  # problem
    df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False, startrow=start_row)  # problem
    start_row = start_row + len(df) + 1
You are recreating the Excel file on every iteration, so each write overwrites the previous one. How about creating the file once, after the loop completes, like this:
datas = []
for extradetails in data2['Data']:
    myDict = {}
    myDict["BusinessId"] = customer['BusinessId']
    myDict["BusinessName"] = customer['BusinessName']
    myDict["Year"] = extradetails['Year']
    myDict["Rate"] = extradetails['Rate']
    print(myDict)
    datas.append(myDict)  # collect rows; build the frame once at the end

df = pd.DataFrame(datas)
df.to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)
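Note the outer loop over CustomerIdList has the same overwrite problem, so a hedged sketch of the whole flow would collect rows across both loops and write once at the very end (BearerAuth, the token, and the endpoint are copied from the question; BusinessName is omitted for brevity, as you would carry it along from the first loop):

rows = []
for i in CustomerIdList:
    links2 = "https://api.myapiloopuplink/" + i + "/History?count=1"
    test2 = requests.get(links2, auth=BearerAuth('PMay4TY5K577b76154i97yC9DlbPytqd'))
    data2 = json.loads(test2.text)
    for extradetails in data2['Data']:
        rows.append({
            "BusinessId": i,
            "Year": extradetails['Year'],
            "Rate": extradetails['Rate'],
        })

pd.DataFrame(rows).to_excel('locationandnameoffile.xlsx', sheet_name='sheet1', index=False)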
I have a list of items in a 'variable:value' format, but the same 'variable' can appear multiple times. The only thing I know is that all values that follow the 'ID' category belong to the same 'ID', so I know how many rows I need (3 in this example).
I need to create a dataframe from this list. The problem I am encountering is that I cannot add a string value to my DF ('could not convert str to float'). I am not sure how to proceed.
import numpy as np
import pandas as pd

mylist = ['ID:1', 'Date: Oct 2', 'B:88', 'C:noun', 'D:44', 'ID:2', 'B:55', 'C:noun', 'D:45', 'ID:3',
          'Date:Sept 5', 'B:55', 'C:verb']

categories = []
for i in mylist:
    var = i.split(":")
    categories.append(var[0])
variables = list(set(categories))

df = np.empty((3, len(variables)))
df = pd.DataFrame(df)

counter = -1
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    tracker = -1
    for j in variables:
        tracker = tracker + 1
        if j == category:
            float(value)
            df[counter, tracker] = value
    if category == "ID":
        counter = counter + 1
        float(value)
        df[counter, 0] = value
In addition, I've tried converting the items in the list to a dictionary, but I am not sure if that's the best way to achieve my goal:
df = np.empty((3, len(variables)))
df = pd.DataFrame(df, columns=variables)

mydict = {}
counter = -1
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    mydict = {category: value}
    if category == "ID":
        counter = counter + 1
        df[counter] = pd.DataFrame.from_dict(mydict)
    else:
        df[counter] = pd.DataFrame.from_dict(mydict)
Edit:
I solved it. Code below:
df = np.empty((0, len(variables)))
df = pd.DataFrame(df, columns=variables)

mydict = {}
counter = 0
for i in mylist:
    item = i.split(":")
    category = item[0]
    value = item[1]
    mynewdef = {category: value}
    counter = counter + 1
    if counter == len(mylist):
        mydict.update(mynewdef)  # include the final item before the last append
        df = df.append(mydict, ignore_index=True)
        df = df.iloc[1:]
    elif category == 'ID':
        df = df.append(mydict, ignore_index=True)
        mydict = {}
        mydict.update(mynewdef)
    else:
        mydict.update(mynewdef)
Perhaps this works:

df = pd.DataFrame([e.split(':') for e in mylist],
                  columns=['key', 'value'])
df = df.pivot(columns='key', values='value')  # not tested
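Since the pivot above is untested and spreads each record over several rows, here is a hedged alternative sketch: start a new record at each 'ID:' entry, then build the frame from the list of dicts (this assumes 'ID' always begins a record, as in mylist above).

import pandas as pd

mylist = ['ID:1', 'Date: Oct 2', 'B:88', 'C:noun', 'D:44', 'ID:2', 'B:55', 'C:noun', 'D:45', 'ID:3',
          'Date:Sept 5', 'B:55', 'C:verb']

records = []
for entry in mylist:
    key, _, value = entry.partition(':')
    if key == 'ID':
        records.append({})          # 'ID' starts a new row
    records[-1][key.strip()] = value.strip()

df = pd.DataFrame(records)
#   ID    Date   B     C    D
# 0  1   Oct 2  88  noun   44
# 1  2     NaN  55  noun   45
# 2  3  Sept 5  55  verb  NaN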