How to do complex calculations in a pandas DataFrame - Python

sample dataframe:
df = pd.DataFrame({'sales': ['2020-01','2020-02','2020-03','2020-04','2020-05','2020-06'],
                   '2020-01': [24,42,18,68,24,30],
                   '2020-02': [24,42,18,68,24,30],
                   '2020-03': [64,24,70,70,88,57],
                   '2020-04': [22,11,44,3,5,78],
                   '2020-05': [11,35,74,12,69,51]})
I want to compute the column df['L2'] shown below.
I studied pandas rolling, groupby, etc., but could not solve it.
Please read the L2 formula and give me your opinion.
L2 formula
L2(Jan-20) = 24
-------------------
sales 2020-01
0 2020-01 24
-------------------
L2(Feb-20) = 132 (sum of the 2x2 matrix below)
sales 2020-01 2020-02
0 2020-01 24 24
1 2020-02 42 42
-------------------
L2(Mar-20) = 154 (sum of the 2x2 matrix below)
sales 2020-02 2020-03
0 2020-02 42 24
1 2020-03 18 70
-------------------
L2(Apr-20) = 187 (sum of the 2x2 matrix below)
sales 2020-03 2020-04
0 2020-03 70 44
1 2020-04 70 3
desired output:
Unnamed: 0 sales Jan-20 Feb-20 Mar-20 Apr-20 May-20 L2 L3
0 0 Jan-20 24 24 64 22 11 24 24
1 1 Feb-20 42 42 24 11 35 132 132
2 2 Mar-20 18 18 70 44 74 154 326
3 3 Apr-20 68 68 70 3 12 187 350
4 4 May-20 24 24 88 5 69 89 545
5 5 Jun-20 30 30 57 78 51 203 433

import numpy as np

f = df  # the question's DataFrame
Values = f.values[:, 1:]  # the 6x5 matrix of monthly values
L2 = []
RANGE = Values.shape[0]
for a in range(RANGE):
    if a == 0:
        result = Values[a, a]
    else:
        if Values[a-1:a+1, a-1:a+1].shape == (2, 1):
            # last row: there is no column for this month, so shift one column left
            result = np.sum(Values[a-1:a+1, a-2:a])
        else:
            result = np.sum(Values[a-1:a+1, a-1:a+1])
    L2.append(result)
print(L2)
L2 output: [24, 132, 154, 187, 89, 203]
f["L2"] = L2

import pandas as pd
import numpy as np
# make a dataset
df = pd.DataFrame({'sales': ['2020-01','2020-02','2020-03','2020-04','2020-05','2020-06'],
'2020-01': [24,42,18,68,24,30],
'2020-02': [24,42,18,68,24,30],
'2020-03': [64,24,70,70,88,57],
'2020-04': [22,11,44,3,5,78],
'2020-05': [11,35,74,12,69,51]})
print(df)
# datawork(L2)
for i in range(0, df.shape[0]):
    if i == 0:
        df.loc[i, 'L2'] = df.loc[i, '2020-01']
    else:
        if i != df.shape[0] - 1:
            df.loc[i, 'L2'] = df.iloc[i-1:i+1, i:i+2].sum().sum()
        if i == df.shape[0] - 1:
            df.loc[i, 'L2'] = df.iloc[i-1:i+1, i-1:i+1].sum().sum()
print(df)
# sales 2020-01 2020-02 2020-03 2020-04 2020-05 L2
#0 2020-01 24 24 64 22 11 24.0
#1 2020-02 42 42 24 11 35 132.0
#2 2020-03 18 18 70 44 74 154.0
#3 2020-04 68 68 70 3 12 187.0
#4 2020-05 24 24 88 5 69 89.0
#5 2020-06 30 30 57 78 51 203.0
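For reference, both loop solutions above special-case the first and last rows by hand. A more compact alternative sketch (assuming NumPy >= 1.20, which provides sliding_window_view) sums every 2x2 block once and reads the needed entries off the result:
from numpy.lib.stride_tricks import sliding_window_view
vals = df.iloc[:, 1:6].to_numpy()                         # the 6x5 matrix of monthly values
win = sliding_window_view(vals, (2, 2)).sum(axis=(2, 3))  # sum of every 2x2 block
# the diagonal gives L2 for Feb-20..May-20; Jan-20 is a single cell and
# Jun-20 reuses the last available pair of columns
L2 = [vals[0, 0], *win.diagonal(), win[-1, -1]]
print(L2)  # [24, 132, 154, 187, 89, 203]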

I tried another method.
This method reshapes long (in python: melt), but I applied the reshape twice: the time frequency of 'sales' and of the other columns in df is monthly, not daily, so a second reshape is needed to build an integer column corresponding to the monthly dates.
(I have used Stata more often than Python; in Stata one reshape long is enough because the data has a monthly time frequency, and reshaping there is much easier than in pandas.)
If you are interested, take a look:
# 00.module
import pandas as pd
import numpy as np
from order import order # https://stackoverflow.com/a/68464246/16478699
# 0.make a dataset
df = pd.DataFrame({'sales': ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06'],
'2020-01': [24, 42, 18, 68, 24, 30],
'2020-02': [24, 42, 18, 68, 24, 30],
'2020-03': [64, 24, 70, 70, 88, 57],
'2020-04': [22, 11, 44, 3, 5, 78],
'2020-05': [11, 35, 74, 12, 69, 51]}
)
df.to_stata('dataset.dta', version=119, write_index=False)
print(df)
# 1.reshape long(in python: melt)
t = list(df.columns)
t.remove('sales')
df_long = df.melt(id_vars='sales', value_vars=t, var_name='var', value_name='val')
df_long['id'] = list(range(1, df_long.shape[0] + 1))  # make id for another reshape long
print(df_long)
# 2.another reshape long (in python: melt; reason: make an int column (tid) corresponding to the monthly dates in sales and the monthly columns of df)
df_long2 = df_long.melt(id_vars=['id', 'val'], value_vars=['sales', 'var'])
df_long2['tid'] = df_long2['value'].apply(lambda x: 1 + list(df_long2.value.unique()).index(x))
print(df_long2)
# 3.back to wide form with tid(in python: pd.pivot)
df_wide = pd.pivot(df_long2, index=['id', 'val'], columns='variable', values=['value', 'tid'])
df_wide.columns = df_wide.columns.map(lambda x: x[1] if x[0] == 'value' else f'{x[0]}_{x[1]}') # change multiindex columns name into just normal columns name
df_wide = df_wide.reset_index()
print(df_wide)
# 4.make values of L2
for i in df_wide.tid_sales.unique():
    if list(df_wide.tid_sales.unique()).index(i) + 1 == len(df_wide.tid_sales.unique()):
        # last month: there is no value column for this month, so shift one column left
        df_wide.loc[df_wide['tid_sales'] == i, 'L2'] = df_wide.loc[
            ((df_wide['tid_sales'] == i) | (df_wide['tid_sales'] == i - 1))
            & ((df_wide['tid_var'] == i - 1) | (df_wide['tid_var'] == i - 2)), 'val'].sum()
    else:
        df_wide.loc[df_wide['tid_sales'] == i, 'L2'] = df_wide.loc[
            ((df_wide['tid_sales'] == i) | (df_wide['tid_sales'] == i - 1))
            & ((df_wide['tid_var'] == i) | (df_wide['tid_var'] == i - 1)), 'val'].sum()
print(df_wide)
# 5.back to shape of df with L2(reshape wide, in python: pd.pivot)
df_final = df_wide.drop(columns=df_wide.filter(regex='^tid').columns)  # no more columns starting with tid needed
df_final = pd.pivot(df_final, index=['sales', 'L2'], columns='var', values='val').reset_index()
df_final = order(df_final, 'L2', f_or_l='last') # order function is made by me
print(df_final)
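A side note on the Stata comparison: pandas also has pd.wide_to_long, which comes closer to a single Stata-style reshape long. A hedged sketch (the rename is only needed because wide_to_long expects a common stub prefix on the wide columns):
tmp = df.rename(columns={c: f'val{c}' for c in df.columns if c != 'sales'})
df_long = pd.wide_to_long(tmp, stubnames='val', i='sales', j='var',
                          suffix=r'.+').reset_index()
print(df_long)  # one row per (sales, var) pair, values in the 'val' column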

Related

Optimize dataframe fill and refill Python Pandas

I have changed the column names and added new columns too.
I have a numpy array that I have to fill into the respective dataframe columns.
Filling the dataframe with the following code is slow:
import pandas as pd
import numpy as np

df = pd.read_csv("sample.csv")
df = df.tail(1000)

DISPLAY_IN_TRAINING = []
Slice_Middle_Piece_X = slice(None, -1, None)
Slice_Middle_Piece_Y = slice(-1, None)
input_slicer = slice(None, None)
output_slice = slice(None, None)
seq_len = 15  # choose sequence length
n_steps = seq_len - 1
Disp_Data = df

def Generate_DataSet(stock, df_clone, seq_len):
    global DISPLAY_IN_TRAINING
    data_raw = stock.values  # convert to numpy array
    data = []
    len_data_raw = data_raw.shape[0]
    for index in range(0, len_data_raw - seq_len + 1):
        data.append(data_raw[index: index + seq_len])
    data = np.array(data)
    test_set_size = int(np.round(30 / 100 * data.shape[0]))
    train_set_size = data.shape[0] - test_set_size
    x_train, y_train = Get_Data_Chopped(data[:train_set_size])
    print("Training Sliced Successful....!")
    df_train_candle = df_clone[n_steps : train_set_size + n_steps]
    if len(DISPLAY_IN_TRAINING) == 0:
        DISPLAY_IN_TRAINING = list(df_clone)
    df_train_candle.columns = DISPLAY_IN_TRAINING
    return [x_train, y_train, df_train_candle]

def Get_Data_Chopped(data_related_to):
    x_values = []
    y_values = []
    for index, iter_values in enumerate(data_related_to):
        x_values.append(iter_values[Slice_Middle_Piece_X, input_slicer])
        y_values.append([item for sublist in iter_values[Slice_Middle_Piece_Y, output_slice] for item in sublist])
    x_values = np.asarray(x_values)
    y_values = np.asarray(y_values)
    return [x_values, y_values]

x_train, y_train, df_train_candle = Generate_DataSet(df, Disp_Data, seq_len)
df_train_candle.reset_index(drop=True, inplace=True)
df_columns = list(df_train_candle)
df_outputs_name = []
OUTPUT_COLUMN = df.columns
for output_column_name in OUTPUT_COLUMN:
    df_outputs_name.append(output_column_name + "_pred")
    for i in range(len(df_columns)):
        if df_columns[i] == output_column_name:
            df_columns[i] = output_column_name + "_orig"
            break
df_train_candle.columns = df_columns
df_pred_names = pd.DataFrame(columns=df_outputs_name)
df_train_candle = df_train_candle.join(df_pred_names, how="outer")
for row_index, row_value in enumerate(y_train):
    for valueindex, output_label in enumerate(OUTPUT_COLUMN):
        df_train_candle.loc[row_index, output_label + "_orig"] = row_value[valueindex]
        df_train_candle.loc[row_index, output_label + "_pred"] = row_value[valueindex]
print(df_train_candle.head())
The shape of my y_train is (195, 24) and the dataframe shape is (195, 48). Now I am trying to optimize the process and make it faster. y_train may change shape to, say, (195, 1) or (195, 5).
So can someone please tell me another (optimized) way of doing the above? I want a general solution that fits any shape without losing data integrity and is faster too.
If the data size increases from 1000 to 2000 the process becomes slow. Please advise how to make it faster.
Sample Data df looks like this with shape (1000, 8)
A B C D E F G H
64272 195 215 239 272 22 11 33 55
64273 196 216 240 273 22 11 33 55
64274 197 217 241 274 22 11 33 55
64275 198 218 242 275 22 11 33 55
64276 199 219 243 276 22 11 33 55
The output looks like this:
A_orig B_orig C_orig D_orig E_orig F_orig G_orig H_orig A_pred B_pred C_pred D_pred E_pred F_pred G_pred H_pred
0 10 30 54 87 22 11 33 55 10 30 54 87 22 11 33 55
1 11 31 55 88 22 11 33 55 11 31 55 88 22 11 33 55
2 12 32 56 89 22 11 33 55 12 32 56 89 22 11 33 55
3 13 33 57 90 22 11 33 55 13 33 57 90 22 11 33 55
4 14 34 58 91 22 11 33 55 14 34 58 91 22 11 33 55
Please generate a csv with 1000 or more lines and you will see that the program becomes slower. I want to make it faster. I hope this is enough to understand the problem.
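One common speed-up (a sketch, not from the original post): replace the cell-by-cell .loc writes in the final double loop with bulk column assignments. This assumes the rows of y_train line up positionally with df_train_candle after reset_index(drop=True) and that y_train has one column per entry of OUTPUT_COLUMN:
orig_cols = [c + "_orig" for c in OUTPUT_COLUMN]
pred_cols = [c + "_pred" for c in OUTPUT_COLUMN]
# two bulk assignments instead of len(y_train) * len(OUTPUT_COLUMN) scalar writes
df_train_candle[orig_cols] = y_train
df_train_candle[pred_cols] = y_train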

How to calculate cumulative sum and average on file data in python

I have a below data in file
NAME,AGE,MARKS
A1,12,40
B1,13,54
C1,15,67
D1,11,41
E1,16,59
F1,10,60
If the data were in a database table, I would have used the SUM and AVG functions to get the cumulative sum and average.
But how to get them with Python is a bit challenging, as I am a learner.
Expected output :
NAME,AGE,MARKS,CUM_SUM,AVG
A1,12,40,40,40
B1,13,54,94,47
C1,15,67,161,53.66
D1,11,41,202,50.5
E1,16,59,261,52.2
F1,10,60,321,53.5
IIUC (if I understand correctly), use:
df = pd.read_csv('file')
df['CUM_SUM'] = df['MARKS'].cumsum()
df['AVG'] = df['MARKS'].expanding().mean()
print (df)
NAME AGE MARKS CUM_SUM AVG
0 A1 12 40 40 40.000000
1 B1 13 54 94 47.000000
2 C1 15 67 161 53.666667
3 D1 11 41 202 50.500000
4 E1 16 59 261 52.200000
5 F1 10 60 321 53.500000
Last, write the result back out:
df.to_csv('file.csv', index=False)
Or:
out = df.to_string(index=False)
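For what it's worth, the expanding mean equals the cumulative sum divided by the running count, so AVG can also be derived from the CUM_SUM column directly (a sketch):
import numpy as np
df['AVG'] = df['CUM_SUM'] / np.arange(1, len(df) + 1)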

Manipulating data in Pandas

This is my data:
Number Name Points Math Points BG Wish
0 1 Огнян 50 65 MT
1 2 Момчил 61 27 MT
2 3 Радослав 68 68 MT
3 4 Павел 28 16 MT
4 10 Виктор 67 76 MT
5 11 Петър 26 68 BT
6 12 Антон 64 58 BT
7 13 Васил 29 42 BT
8 20 Виктория 62 67 BT
That's my code:
df = pd.read_csv('Input_data.csv', encoding='utf-8-sig')
df['Total'] = df.iloc[:, 2:].sum(axis=1)
df = df.sort_values(['Total', 'Name'], ascending=[0, 1])
df.to_excel("BT RANKING_5.xlsx", encoding='utf-8-sig', index=False)
I want to double the score in the Points Math column for each person who has Wish == MT.
I tried:
df.loc[df['Wish'] == 'MT', 'Points Math'] = df.loc[df['Points Math'] * 2]
but this didn't work. I also tried an if statement and a for loop, but they didn't work either.
What's the appropriate syntax for this logic?
Use this:
df['Points_Math'] = np.where(df['Wish'] == 'MT', df['Points Math'] * 2, df['Points Math'])
A new column 'Points_Math' is created with the desired results; to overwrite the original instead, replace 'Points_Math' with 'Points Math'.
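An equivalent in-place alternative (a sketch) that scales the matching rows directly, with no helper column:
df.loc[df['Wish'] == 'MT', 'Points Math'] *= 2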

How to find the index of a value by row in a dataframe in python and extract the value of the following column

I have the following dataframe using pandas
df = pd.DataFrame({'Last_Name': ['Smith', None, 'Brown'],
'Date0': ['01/01/1999','01/06/1999','01/01/1979'], 'Age0': [29,44,21],
'Date1': ['08/01/1999','07/01/2014','01/01/2016'],'Age1': [35, 45, 47],
'Date2': [None,'01/06/2035','08/01/1979'],'Age2': [47, None, 74],
'Last_age': [47,45,74]})
I would like to add a new column containing the date corresponding to the value in 'Last_age' for each row, to get something like this:
df = pd.DataFrame({'Last_Name': ['Smith', None, 'Brown'],
'Date0': ['01/01/1999','01/06/1999','01/01/1979'], 'Age0': [29,44,21],
'Date1': ['08/01/1999','07/01/2014','01/01/2016'],'Age1': [35, 45, 47],
'Date2': [None,'01/06/2035','08/01/1979'],'Age2': [47, None, 74],
'Last_age': [47,45,74],
'Last_age_date': ['Error no date','07/01/2014','08/01/1979']})
I will just use wide_to_long to reshape your df:
s=pd.wide_to_long(df.reset_index(),['Date','Age'],i=['Last_age','index'],j='Drop')
s.loc[s.Age==s.index.get_level_values(0),'Date']
Out[199]:
Last_age index Drop
47 0 2 None
45 1 1 07/01/2014
74 2 2 08/01/1979
Name: Date, dtype: object
df['Last_age_date']=s.loc[s.Age==s.index.get_level_values(0),'Date'].values
df
Out[201]:
Last_Name Date0 Age0 ... Age2 Last_age Last_age_date
0 Smith 01/01/1999 29 ... 47.0 47 None
1 None 01/06/1999 44 ... NaN 45 07/01/2014
2 Brown 01/01/1979 21 ... 74.0 74 08/01/1979
[3 rows x 9 columns]
Something like this should do what you are looking for:
# get the age and date columns (you might have more than just these two)
age_columns = [c for c in df.columns if 'Age' in c][::-1]
date_columns = [c for c in df.columns if 'Date' in c][::-1]

def get_last_age_date(row):
    # walk from the newest age column back and return the first date with a valid age
    for age, date in zip(age_columns, date_columns):
        if not np.isnan(row[age]):
            return row[date]
    return np.nan

# apply the function to all the rows in the dataframe
df['Last_age_date'] = df.apply(lambda row: get_last_age_date(row), axis=1)
# fix the NaN values to say 'Error no date'
df.Last_age_date.where(~df.Last_age_date.isna(), 'Error no date', inplace=True)
print(df)
Welcome to Stack Overflow! You can write a small function to achieve this. Your input dataframe looks like this:
df
Last_Name Date0 Age0 Date1 Age1 Date2 Age2 Last_age
0 Smith 01/01/1999 29 08/01/1999 35 None 47.0 47
1 None 01/06/1999 44 07/01/2014 45 01/06/2035 NaN 45
2 Brown 01/01/1979 21 01/01/2016 47 08/01/1979 74.0 74
Write a function like this:
def last_Age(row):
    if row['Last_age'] == row['Age2']:
        return row['Date2']
    elif row['Last_age'] == row['Age1']:
        return row['Date1']
    elif row['Last_age'] == row['Age0']:
        return row['Date0']

df['Last_age_date'] = df.apply(last_Age, axis=1)
df
Last_Name Date0 Age0 Date1 Age1 Date2 Age2 Last_age Last_age_date
0 Smith 01/01/1999 29 08/01/1999 35 None 47.0 47 None
1 None 01/06/1999 44 07/01/2014 45 01/06/2035 NaN 45 07/01/2014
2 Brown 01/01/1979 21 01/01/2016 47 08/01/1979 74.0 74 08/01/1979
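A vectorized alternative (a sketch, assuming the AgeN/DateN columns pair up by number): compare all Age columns against Last_age at once, pick the first matching Date, and fall back to 'Error no date' when the match or the date itself is missing:
import numpy as np
ages = df.filter(regex=r'^Age\d').to_numpy(dtype=float)   # Age0..Age2, None -> NaN
dates = df.filter(regex=r'^Date\d').to_numpy()            # Date0..Date2
mask = ages == df['Last_age'].to_numpy()[:, None]         # where Age equals Last_age
idx = mask.argmax(axis=1)                                 # first matching column per row
picked = np.where(mask.any(axis=1), dates[np.arange(len(df)), idx], None)
df['Last_age_date'] = pd.Series(picked).fillna('Error no date')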

Modify and round numbers in a pandas dataframe in Python

Long story short, I have a csv file which I read as a pandas dataframe. The file contains a weather report, but all of the measurements for temperature are in Fahrenheit. I've figured out how to convert them:
import pandas as pd
df = pd.read_csv('report.csv')
df['average temperature'] = (df['average temperature'] - 32) * 5/9
But then the data in this column has up to 6 decimal places.
I've found code that rounds all the data in the dataframe, but I need it for this column only:
df.round(2)
I don't like that it has to be a separate piece of code on a separate line and that it modifies all of my data. Is there a more elegant way to go about this problem? And is there a way to apply it to other columns in my dataframe, such as maximum temperature and minimum temperature, without copying the above piece of code?
To round only some columns, use a subset:
cols = ['maximum temperature','minimum temperature','average temperature']
df[cols] = df[cols].round(2)
If you want to convert only some columns from a list:
cols = ['maximum temperature','minimum temperature','average temperature']
df[cols] = ((df[cols] - 32) * 5/9).round(2)
If you want to round each column separately:
df['average temperature'] = df['average temperature'].round(2)
df['maximum temperature'] = df['maximum temperature'].round(2)
df['minimum temperature'] = df['minimum temperature'].round(2)
Sample:
df = (pd.DataFrame(np.random.randint(30, 100, (10, 3)),
                   columns=['maximum temperature','minimum temperature','average temperature'])
        .assign(a='m', b=range(10)))
print (df)
maximum temperature minimum temperature average temperature a b
0 97 60 98 m 0
1 64 86 64 m 1
2 32 64 95 m 2
3 60 56 93 m 3
4 43 89 64 m 4
5 40 62 86 m 5
6 37 40 70 m 6
7 61 33 46 m 7
8 36 44 46 m 8
9 63 30 33 m 9
cols = ['maximum temperature','minimum temperature','average temperature']
df[cols] = ((df[cols] - 32) * 5/9).round(2)
print (df)
maximum temperature minimum temperature average temperature a b
0 36.11 15.56 36.67 m 0
1 17.78 30.00 17.78 m 1
2 0.00 17.78 35.00 m 2
3 15.56 13.33 33.89 m 3
4 6.11 31.67 17.78 m 4
5 4.44 16.67 30.00 m 5
6 2.78 4.44 21.11 m 6
7 16.11 0.56 7.78 m 7
8 2.22 6.67 7.78 m 8
9 17.22 -1.11 0.56 m 9
Here's a short solution with apply and a conversion function:
def convert_to_celsius(f):
    return 5.0 / 9.0 * (f - 32)

df[['Column A','Column B']] = df[['Column A','Column B']].apply(convert_to_celsius).round(2)
