Why is groupby so slow on large data? - Python

I built this code to group my data by total, month, week, day, hour, and 10 minutes. The problem is that on a large file with more than 20 million rows it takes too long; it did not finish even after 20 hours. I tried splitting the file into smaller files, but it is still slow.
Can you please check my code? Is it possible to speed up the process?
import numpy as np
import pandas as pd

def my_agg1(x):
    names = {
        'Total_in_averge_amount': x['value'].mean(),
        'Total_in_max_amount': x['value'].max(),
        'Total_in_min_amount': x['value'].min(),
        'Total_in_totalamount': x['value'].sum(),
        'Total_in_standard_deviation': x['value'].std(),
        'Total_in_degree': x['inputs'].sum(),
        'duration': x['date'].max() - x['date'].min()}
    return pd.Series(names, index=['Total_in_totalamount', 'Total_in_degree', 'Total_in_averge_amount', 'Total_in_max_amount', 'Total_in_min_amount', 'Total_in_standard_deviation', 'duration'])
# Read CSV File
df = pd.read_csv('inputs_header_2.csv', encoding="ISO-8859-1", parse_dates=['date'])  # parse dates so duration/resample work
# Total
groupTotal = df.groupby(['user']).apply(my_agg1)
c = abs(groupTotal['duration'].astype('timedelta64[D]'))
groupTotal['Total_in_standard_deviation'].fillna(0, inplace=True)
groupTotal['DurationInMin'] = (c * 24) * 60
TotalinDegree = groupTotal['Total_in_degree']
InTotalAmount = groupTotal['Total_in_totalamount']
InTransactionRate = (TotalinDegree / c)
groupTotal['Total_in_transaction_rate'] = round(InTransactionRate, 2)
groupTotal['Total_average_in_speed'] = InTotalAmount / c
Acceleration = (InTotalAmount / c ** 2)
groupTotal['Total_in-acceleration'] = Acceleration
groupTotal.replace(np.inf, 0, inplace=True)
groupTotal = groupTotal.drop(groupTotal.columns[[6]], axis=1)
#groupTotal.to_csv(r'fr1.csv', sep='\t', float_format='%.8f')
print(groupTotal)
groupTotal.to_csv(r'total.csv', sep='\t', float_format='%.8f')
# print(groupTotal)
#################################################################################
# Monthly
monthly_group_result = (df.set_index('date')
                          .groupby('user')
                          .resample('M', convention='start')
                          .agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
                                'inputs': ['sum']}))
monthly_group_result.columns = monthly_group_result.columns.map('_'.join)
d_monthly = {'value_max': 'Monthly_in_max_amount',
             'value_min': 'Monthly_in_min_amount',
             'value_sum': 'Monthly_in_totalamount',
             'inputs_sum': 'Monthly_in_degree',
             'value_std': 'Monthly_in_standard_deviation',
             'value_mean': 'Monthly_in_averge_amount'}
monthly_group = monthly_group_result.rename(columns=d_monthly).reset_index()
MonthlygroupUser = monthly_group.loc[monthly_group.groupby('user')['Monthly_in_degree'].idxmax()]
MonthlygroupUser['Monthly_in_standard_deviation'].fillna(0, inplace=True)
MonthlyinDegree = MonthlygroupUser['Monthly_in_degree']
MonthlyInTotalAmount = MonthlygroupUser['Monthly_in_totalamount']
Monthly_date = MonthlygroupUser['date'].dt.daysinmonth
MonthlyInTransactionRate = (MonthlyinDegree / Monthly_date)
MonthlygroupUser['Monthly_in_transaction_rate'] = round(MonthlyInTransactionRate, 3)
MonthlygroupUser['Monthly_average_in_speed'] = MonthlyInTotalAmount / Monthly_date
MonthlyAcceleration = (MonthlyInTotalAmount / Monthly_date ** 2)
MonthlygroupUser['Monthly_in-acceleration'] = MonthlyAcceleration
MonthlygroupUser = MonthlygroupUser.drop(MonthlygroupUser.columns[[1, 5]], axis=1)
# Weekly
Weekly_group_result = (df.set_index('date')
                         .groupby('user')
                         .resample('7D', convention='start')
                         .agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
                               'inputs': ['sum']}))
Weekly_group_result.columns = Weekly_group_result.columns.map('_'.join)
d_weekly = {'value_max': 'Weekly_in_max_amount',
            'value_min': 'Weekly_in_min_amount',
            'value_sum': 'Weekly_in_totalamount',
            'inputs_sum': 'Weekly_in_degree',
            'value_std': 'Weekly_in_standard_deviation',
            'value_mean': 'Weekly_in_averge_amount'}
Weeklygroup = Weekly_group_result.rename(columns=d_weekly).reset_index()
WeeklygroupUser = Weeklygroup.loc[Weeklygroup.groupby('user')['Weekly_in_degree'].idxmax()]
WeeklygroupUser['Weekly_in_standard_deviation'].fillna(0, inplace=True)
WeeklyinDegree = WeeklygroupUser['Weekly_in_degree']
WeeklyInTotalAmount = WeeklygroupUser['Weekly_in_totalamount']
WeeklyInTransactionRate = (WeeklyinDegree / 7)
WeeklygroupUser['Weekly_in_transaction_rate'] = round(WeeklyInTransactionRate, 3)
WeeklygroupUser['Weekly_average_in_speed'] = WeeklyInTotalAmount / 7
WeeklyAcceleration = (WeeklyInTotalAmount / 7 ** 2)
WeeklygroupUser['Weekly_in-acceleration'] = WeeklyAcceleration
WeeklygroupUser = WeeklygroupUser.drop(WeeklygroupUser.columns[[1, 4]], axis=1)
# daily
Daily_group_result = (df.set_index('date')
                        .groupby('user')
                        .resample('D', convention='start')
                        .agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
                              'inputs': ['sum']}))
Daily_group_result.columns = Daily_group_result.columns.map('_'.join)
d_daily = {'value_max': 'Daily_in_max_amount',
           'value_min': 'Daily_in_min_amount',
           'value_sum': 'Daily_in_totalamount',
           'inputs_sum': 'Daily_in_degree',
           'value_std': 'Daily_in_standard_deviation',
           'value_mean': 'Daily_in_averge_amount'}
Dailygroup = Daily_group_result.rename(columns=d_daily).reset_index()
DailygroupUser = Dailygroup.loc[Dailygroup.groupby('user')['Daily_in_degree'].idxmax()]
DailygroupUser['Daily_in_standard_deviation'].fillna(0, inplace=True)
DailyinDegree = DailygroupUser['Daily_in_degree']
DailyInTotalAmount = DailygroupUser['Daily_in_totalamount']
DailyInTransactionRate = (DailyinDegree / 1)
DailygroupUser['Daily_in_transaction_rate'] = round(DailyInTransactionRate, 3)
DailygroupUser['Daily_average_in_speed'] = DailyInTotalAmount / 1
DailyAcceleration = (DailyInTotalAmount / 1 ** 2)
DailygroupUser['Daily_in-acceleration'] = DailyAcceleration
DailygroupUser = DailygroupUser.drop(DailygroupUser.columns[[1, 4]], axis=1)
# Hourly
# group all users by hour
Hourly_group_result = (df.set_index('date')
                         .groupby('user')
                         .resample('H', convention='start')
                         .agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
                               'inputs': ['sum']}))
Hourly_group_result.columns = Hourly_group_result.columns.map('_'.join)
d_hourly = {'value_max': 'Hourly_in_max_amount',
            'value_min': 'Hourly_in_min_amount',
            'value_sum': 'Hourly_in_totalamount',
            'inputs_sum': 'Hourly_in_degree',
            'value_std': 'Hourly_in_standard_deviation',
            'value_mean': 'Hourly_in_averge_amount'}
Hourlygroup = Hourly_group_result.rename(columns=d_hourly).reset_index()
HourlygroupUser = Hourlygroup.loc[Hourlygroup.groupby('user')['Hourly_in_degree'].idxmax()]
HourlygroupUser['Hourly_in_standard_deviation'].fillna(0, inplace=True)
HourlyinDegree = HourlygroupUser['Hourly_in_degree']
HourlyInTotalAmount = HourlygroupUser['Hourly_in_totalamount']
HourlyInTransactionRate = (HourlyinDegree / (1 / 24))
HourlygroupUser['Hourly_in_transaction_rate'] = round(HourlyInTransactionRate, 3)
HourlygroupUser['Hourly_average_in_speed'] = HourlyInTotalAmount / 1
HourlyAcceleration = (HourlyInTotalAmount / (1 / 24) ** 2)
HourlygroupUser['Hourly_in-acceleration'] = HourlyAcceleration
HourlygroupUser = HourlygroupUser.drop(HourlygroupUser.columns[[1, 4]], axis=1)
# group all users by 10Min
TenMin_group_result = (df.set_index('date')
                         .groupby('user')
                         .resample('10Min', convention='start')
                         .agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
                               'inputs': ['sum']}))
TenMin_group_result.columns = TenMin_group_result.columns.map('_'.join)
d_10Min = {'value_max': 'TenMin_in_max_amount',
           'value_min': 'TenMin_in_min_amount',
           'value_sum': 'TenMin_in_totalamount',
           'inputs_sum': 'TenMin_in_degree',
           'value_std': 'TenMin_in_standard_deviation',
           'value_mean': 'TenMin_in_averge_amount'}
TenMingroup = TenMin_group_result.rename(columns=d_10Min).reset_index()
TenMingroupUser = TenMingroup.loc[TenMingroup.groupby('user')['TenMin_in_degree'].idxmax()]
TenMingroupUser['TenMin_in_standard_deviation'].fillna(0, inplace=True)
TenMininDegree = TenMingroupUser['TenMin_in_degree']
TenMinInTotalAmount = TenMingroupUser['TenMin_in_totalamount']
TenMinInTransactionRate = (TenMininDegree / (1 / 24 / 6))
TenMingroupUser['TenMin_in_transaction_rate'] = round(TenMinInTransactionRate, 3)
TenMingroupUser['TenMin_average_in_speed'] = TenMinInTotalAmount / (1 / 24 / 6)
TenMinAcceleration = ((TenMinInTotalAmount / (1 / 24 / 6) ** 2)) - (1 / 24 / 6)
TenMingroupUser['TenMin_in-acceleration'] = TenMinAcceleration
TenMingroupUser = TenMingroupUser.drop(TenMingroupUser.columns[[1, 4]], axis=1)
#TenMingroupUser.to_csv(r'fr1.csv', sep='\t', float_format='%.8f')
FinalTotal = (pd.merge(groupTotal, MonthlygroupUser, on='user')
                .merge(WeeklygroupUser, on='user')
                .merge(DailygroupUser, on='user')
                .merge(HourlygroupUser, on='user')
                .merge(TenMingroupUser, on='user'))
FinalTotal.to_csv(r'Final_Inputs_2.csv', sep='\t', float_format='%.8f', index=False)
Thank you for any help
Regards,
Khaled
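The likely bottleneck is groupby(['user']).apply(my_agg1): apply calls a Python function once per group, i.e. once per user, while the monthly/weekly/daily/hourly/10-minute blocks below it already use the fast built-in aggregations. A minimal sketch of a vectorized replacement for the Total block, assuming pandas 0.25+ for named aggregation (the derived rate/speed/acceleration columns can then be computed on groupTotal exactly as before):

import pandas as pd

df = pd.read_csv('inputs_header_2.csv', encoding="ISO-8859-1", parse_dates=['date'])

# One cythonized pass over the data instead of one Python call per user.
groupTotal = df.groupby('user').agg(
    Total_in_totalamount=('value', 'sum'),
    Total_in_degree=('inputs', 'sum'),
    Total_in_averge_amount=('value', 'mean'),
    Total_in_max_amount=('value', 'max'),
    Total_in_min_amount=('value', 'min'),
    Total_in_standard_deviation=('value', 'std'),
    date_min=('date', 'min'),
    date_max=('date', 'max'),
)
groupTotal['duration'] = groupTotal.pop('date_max') - groupTotal.pop('date_min')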

Related

Python Excel Calculations

I am not getting correct calculations for three columns I am trying to write to a data sheet for a specific date and time in time-series data. I want to calculate the difference between various times and the closing-price time, but for some reason I can't get correct output for the calculations.
import pandas as pd
import os
import numpy as np
from openpyxl import Workbook
# Read the data into a Pandas DataFrame
directory_path = "C:/Users/bean/Desktop/_L"
os.chdir(directory_path)
book = Workbook()
book.remove(book.active) # remove the first sheet
for file in os.listdir(directory_path):
    if file.endswith(".csv"):
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path)
        # Create a new DataFrame for each file
        df_diff = df[['Date', 'CET', 'NA', 'UTC', 'Name', 'BOLLBU', 'BOLLBM', 'BOLLBL',
                      'VWAP', 'VWAPSD1U', 'VWAPSD1L', 'VWAPSD2U', 'VWAPSD2L', 'ATR', 'ATRMA']]
        df['Date'] = pd.to_datetime(df['Date'])
        df['CET'] = pd.to_datetime(df['Date'])
        df['UTC'] = pd.to_datetime(df['Date'])
        df['NA'] = pd.to_datetime(df['Date'])
        df_diff['Date'] = pd.to_datetime(df['Date'])
        df_diff['CET'] = pd.to_datetime(df['CET'])
        df_diff['UTC'] = pd.to_datetime(df['UTC'])
        df_diff['NA'] = pd.to_datetime(df['NA'])
        df_diff['Open'] = df['Open']
        df_diff['High'] = df['High']
        df_diff['Low'] = df['Low']
        df_diff['Close'] = df['Close']
        # Calculate the differences and add them as new columns
        df_diff['Open Diff'] = (df['Open'].shift(-1) - df['Open']) / df['Open'] * 100
        df_diff['High Diff'] = (df['High'].shift(-1) - df['High']) / df['High'] * 100
        df_diff['Low Diff'] = (df['Low'].shift(-1) - df['Low']) / df['Low'] * 100
        df_diff['Close Diff'] = (df['Close'].shift(-1) - df['Close']) / df['Close'] * 100
        df_1635 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 35)].sort_values(by='Date', ascending=False)
        df_1625 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 25)].sort_values(by='Date', ascending=False)
        df_1620 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 20)].sort_values(by='Date', ascending=False)
        df_1615 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 15)].sort_values(by='Date', ascending=False)
        df_1610 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 10)].sort_values(by='Date', ascending=False)
        df_1605 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 5)].sort_values(by='Date', ascending=False)
        df_1600 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1545 = df[(df['Date'].dt.hour == 15) & (df['Date'].dt.minute == 45)].sort_values(by='Date', ascending=False)
        df_1530 = df[(df['Date'].dt.hour == 15) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_1500 = df[(df['Date'].dt.hour == 15) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1445 = df[(df['Date'].dt.hour == 14) & (df['Date'].dt.minute == 45)].sort_values(by='Date', ascending=False)
        df_1430 = df[(df['Date'].dt.hour == 14) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_1400 = df[(df['Date'].dt.hour == 14) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1330 = df[(df['Date'].dt.hour == 13) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_1300 = df[(df['Date'].dt.hour == 13) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1230 = df[(df['Date'].dt.hour == 12) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_0800 = df[(df['Date'].dt.hour == 8) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        # Calculate difference between Close price of df_1635 and other DataFrames
        df_diff_1635_1625 = df_1635['Close'] - df_1625['Close']
        df_diff_1635_1620 = df_1635['Close'].subtract(df_1620['Close'])
        df_diff_1635_1615 = df_1635['Close'].subtract(df_1615['Close'])
        df_diff_1635_1610 = df_1635['Close'].subtract(df_1610['Close'])
        df_diff_1635_1605 = df_1635['Close'].subtract(df_1605['Close'])
        df_diff_1635_1600 = df_1635['Close'].subtract(df_1600['Close'])
        df_diff_1635_1545 = df_1635['Close'].subtract(df_1545['Close'])
        df_diff_1635_1530 = df_1635['Close'].subtract(df_1530['Close'])
        df_diff_1635_1500 = df_1635['Close'].subtract(df_1500['Close'])
        df_diff_1635_1445 = df_1635['Close'].subtract(df_1445['Close'])
        df_diff_1635_1430 = df_1635['Close'].subtract(df_1430['Close'])
        df_diff_1635_1400 = df_1635['Close'].subtract(df_1400['Close'])
        df_diff_1635_1330 = df_1635['Close'].subtract(df_1330['Close'])
        df_diff_1635_1300 = df_1635['Close'].subtract(df_1300['Close'])
        df_diff_1635_1230 = df_1635['Close'].subtract(df_1230['Close'])
        df_diff_1635_0800 = df_1635['Close'].subtract(df_0800['Close'])
        print(df_diff_1635_1625)
        # Add Difference, Percent_Diff, and U/D columns to each DataFrame
        df_1635['Difference'] = df_1635['Close'].subtract(df_1635['Close'].shift())
        df_1635['Percent_Diff'] = (df_1635['Difference'] / df_1635['Close']) * 100
        df_1635['U/D'] = np.where(df_1635['Difference'] > 0, 'U', 'D')
        df_1625['Difference'] = df_diff_1635_1625
        df_1625['Percent_Diff'] = (df_diff_1635_1625 / df_1635['Close']) * 100
        df_1625['U/D'] = np.where(df_1625['Percent_Diff'] > 0, 'U', 'D')
        print(df_1625.dtypes)
        df_1620['Difference'] = df_diff_1635_1620
        df_1620['Percent_Diff'] = (df_diff_1635_1620 / df_1635['Close']) * 100
        df_1620['U/D'] = np.where(df_1620['Percent_Diff'] > 0, 'U', 'D')
        df_1615['Difference'] = df_diff_1635_1615
        df_1615['Percent_Diff'] = (df_diff_1635_1615 / df_1635['Close']) * 100
        df_1615['U/D'] = np.where(df_1615['Percent_Diff'] > 0, 'U', 'D')
        df_1610['Difference'] = df_diff_1635_1610
        df_1610['Percent_Diff'] = (df_diff_1635_1610 / df_1635['Close']) * 100
        df_1610['U/D'] = np.where(df_1610['Percent_Diff'] > 0, 'U', 'D')
        df_1605['Difference'] = df_diff_1635_1605
        df_1605['Percent_Diff'] = (df_diff_1635_1605 / df_1635['Close']) * 100
        df_1605['U/D'] = np.where(df_1605['Percent_Diff'] > 0, 'U', 'D')
        df_1600['Difference'] = df_diff_1635_1600
        df_1600['Percent_Diff'] = (df_diff_1635_1600 / df_1635['Close']) * 100
        df_1600['U/D'] = np.where(df_1600['Percent_Diff'] > 0, 'U', 'D')
        df_1545['Difference'] = df_diff_1635_1545
        df_1545['Percent_Diff'] = (df_diff_1635_1545 / df_1635['Close']) * 100
        df_1545['U/D'] = np.where(df_1545['Percent_Diff'] > 0, 'U', 'D')
        df_1530['Difference'] = df_diff_1635_1530
        df_1530['Percent_Diff'] = (df_diff_1635_1530 / df_1635['Close']) * 100
        df_1530['U/D'] = np.where(df_1530['Percent_Diff'] > 0, 'U', 'D')
        df_1500['Difference'] = df_diff_1635_1500
        df_1500['Percent_Diff'] = (df_diff_1635_1500 / df_1635['Close']) * 100
        df_1500['U/D'] = np.where(df_1500['Percent_Diff'] > 0, 'U', 'D')
        df_1445['Difference'] = df_diff_1635_1445
        df_1445['Percent_Diff'] = (df_diff_1635_1445 / df_1635['Close']) * 100
        df_1445['U/D'] = np.where(df_1445['Percent_Diff'] > 0, 'U', 'D')
        df_1430['Difference'] = df_diff_1635_1430
        df_1430['Percent_Diff'] = (df_diff_1635_1430 / df_1635['Close']) * 100
        df_1430['U/D'] = np.where(df_1430['Percent_Diff'] > 0, 'U', 'D')
        df_1400['Difference'] = df_diff_1635_1400
        df_1400['Percent_Diff'] = (df_diff_1635_1400 / df_1635['Close']) * 100
        df_1400['U/D'] = np.where(df_1400['Percent_Diff'] > 0, 'U', 'D')
        df_1330['Difference'] = df_diff_1635_1330
        df_1330['Percent_Diff'] = (df_diff_1635_1330 / df_1635['Close']) * 100
        df_1330['U/D'] = np.where(df_1330['Percent_Diff'] > 0, 'U', 'D')
        df_1300['Difference'] = df_diff_1635_1300
        df_1300['Percent_Diff'] = (df_diff_1635_1300 / df_1635['Close']) * 100
        df_1300['U/D'] = np.where(df_1300['Percent_Diff'] > 0, 'U', 'D')
        df_1230['Difference'] = df_diff_1635_1230
        df_1230['Percent_Diff'] = (df_diff_1635_1230 / df_1635['Close']) * 100
        df_1230['U/D'] = np.where(df_1230['Percent_Diff'] > 0, 'U', 'D')
        df_0800['Difference'] = df_diff_1635_0800
        df_0800['Percent_Diff'] = (df_diff_1635_0800 / df_1635['Close']) * 100
        df_0800['U/D'] = np.where(df_0800['Percent_Diff'] > 0, 'U', 'D')
        df_25 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 25)].sort_values(by='Date', ascending=False)
        df_35 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 35)].sort_values(by='Date', ascending=False)
        # Concat all results for each time into this sheet.
        df_35 = df_35[['Date', 'CET', 'NA', 'UTC', 'Name', 'Open', 'Open Diff', 'High', 'High Diff', 'Low', 'Low Diff',
                       'Close', 'Close Diff', 'BOLLBU', 'BOLLBM', 'BOLLBL', 'VWAP', 'VWAPSD1U', 'VWAPSD1L', 'VWAPSD2U',
                       'VWAPSD2L', 'ATR', 'ATRMA']]
        df_diff = df_diff.sort_values(by='Date', ascending=False)
        df_diff = df_diff[['Date', 'CET', 'NA', 'UTC', 'Name', 'Open', 'Open Diff', 'High', 'High Diff', 'Low', 'Low Diff',
                           'Close', 'Close Diff', 'BOLLBU', 'BOLLBM', 'BOLLBL', 'VWAP', 'VWAPSD1U', 'VWAPSD1L', 'VWAPSD2U',
                           'VWAPSD2L', 'ATR', 'ATRMA']]
        writer = pd.ExcelWriter(f'{file.split(".")[0]}.xlsx', engine='openpyxl')
        df_diff.to_excel(writer, sheet_name='df_diff', index=False, startrow=0)
        df_35.to_excel(writer, sheet_name='Sheet_35min', index=False)
        dataframes = [df_1625, df_1620, df_1615, df_1610, df_1605, df_1600, df_1545,
                      df_1530, df_1500, df_1445, df_1430, df_1400, df_1330, df_1300, df_1230, df_0800]
        for i, df in enumerate(dataframes):
            df.to_excel(writer, sheet_name=f"df_{i}", index=False)
        writer.save()
Essentially, the calculations under the for loop and for df_35 are not coming out correctly. What am I doing wrong in these operations? The Date column is datetime and I am selecting rows at those specific time values, so I don't understand why it doesn't work. I tried various calculation methods that were still wrong; neither of these works:
df_diff_1635_1625 = df_1635['Close'] - df_1625['Close']
df_diff_1635_1620 = df_1635['Close'].subtract(df_1620['Close'])
All my columns are float64, including the Close ones, except Date, which is datetime. When I check and print the calculation I get NaN values, so it's clearly not processing it.
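In case it helps someone with the same symptom: the NaN values come from index alignment. Subtracting two Series aligns them on their index labels, and the rows selected at 16:35 never share row labels with the rows selected at 16:25, so every aligned pair comes out missing. A minimal sketch of one way around it, indexing both sides by calendar date before subtracting (close_at is a hypothetical helper, not from the code above):

import pandas as pd

def close_at(df, hour, minute):
    # Rows at a given intraday time, indexed by calendar date so that
    # series taken at different times can be aligned day by day.
    rows = df[(df['Date'].dt.hour == hour) & (df['Date'].dt.minute == minute)]
    return rows.set_index(rows['Date'].dt.normalize())['Close']

# Both sides now share the same date index, so the difference is defined
# wherever both times exist for a given day.
diff_1635_1625 = close_at(df, 16, 35) - close_at(df, 16, 25)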

How can we find an optimal point between minimizing expenses and maximizing revenues?

I have this dataframe.
import pandas as pd
import numpy as np
from cvxpy import *
data = [{'Month': '2020-01-01', 'Expense':1000, 'Revenue':-50000, 'Building':'Stadium'},
        {'Month': '2020-02-01', 'Expense':3000, 'Revenue':40000, 'Building':'Stadium'},
        {'Month': '2020-03-01', 'Expense':7000, 'Revenue':50000, 'Building':'Stadium'},
        {'Month': '2020-04-01', 'Expense':3000, 'Revenue':40000, 'Building':'Stadium'},
        {'Month': '2020-01-01', 'Expense':5000, 'Revenue':-6000, 'Building':'Casino'},
        {'Month': '2020-02-01', 'Expense':5000, 'Revenue':4000, 'Building':'Casino'},
        {'Month': '2020-03-01', 'Expense':5000, 'Revenue':9000, 'Building':'Casino'},
        {'Month': '2020-04-01', 'Expense':6000, 'Revenue':10000, 'Building':'Casino'}]
df = pd.DataFrame(data)
df
Now, I am trying to minimize the risk of different investments (in this case Stadiums and Casinos), based on the revenues and expenses of each investment. I am testing this code.
# mean return
r = df.pivot_table(index='Month', columns='Building', values='mth_change', aggfunc='mean').mean()
# covariance matrix
C = np.asmatrix(np.cov(r))
# Get symbols
cols = df.Building.unique()
symbols = cols
# Number of variables
n = len(symbols)
# The variables vector
x = Variable(n)
# The minimum return
req_return = 0.02
# The return
ret = r.T*x
# The risk in xT.Q.x format
risk = quad_form(x, C)
# The core problem definition with the Problem class from CVXPY
prob = Problem(Minimize(risk), [sum(x)==1, ret >= req_return, x >= 0])
try:
    prob.solve()
    print("Optimal portfolio")
    print("----------------------")
    for s in range(len(symbols)):
        print(" Investment in {} : {}% of the portfolio".format(symbols[s], round(100*x.value[s], 2)))
    print("----------------------")
    print("Exp ret = {}%".format(round(100*ret.value, 2)))
    print("Expected risk = {}%".format(round(100*risk.value**0.5, 2)))
except:
    print("Error")
When I get to this line of code:
risk = quad_form(x, C)
I get this error.
Traceback (most recent call last):
File "C:\Users\RShuell\AppData\Local\Temp\1\ipykernel_11456\2449242452.py", line 1, in <module>
risk = quad_form(x, C)
File "C:\Users\RShuell\Anaconda3\lib\site-packages\cvxpy\atoms\quad_form.py", line 225, in quad_form
raise Exception("Invalid dimensions for arguments.")
Exception: Invalid dimensions for arguments.
I found the code at this link:
https://www.kdnuggets.com/2019/06/optimization-python-money-risk.html
I am trying to apply that general logic to the data in my specific dataframe.
This seems to work.
import pandas as pd
import numpy as np
from cvxpy import *
data = [{'Month': '2020-01-01', 'Revenue':100, 'Building':'Stadium'},
        {'Month': '2020-02-01', 'Revenue':400, 'Building':'Stadium'},
        {'Month': '2020-03-01', 'Revenue':500, 'Building':'Stadium'},
        {'Month': '2020-04-01', 'Revenue':40, 'Building':'Stadium'},
        {'Month': '2020-01-01', 'Revenue':260, 'Building':'Casino'},
        {'Month': '2020-02-01', 'Revenue':400, 'Building':'Casino'},
        {'Month': '2020-03-01', 'Revenue':900, 'Building':'Casino'},
        {'Month': '2020-04-01', 'Revenue':1000, 'Building':'Casino'}]
df = pd.DataFrame(data)
df
table = df.pivot(index='Month', columns='Building', values='Revenue')
table.head()
returns = table.pct_change()
mean_returns = returns.mean()
#r = r.fillna(0)
cov_matrix = returns.cov()
# Get symbols
#cols = df.Building.unique()
symbols = table.columns
# Number of variables
n = 2 #len(symbols)
# The variables vector
x = Variable(n)
# The minimum return
req_return = 0.02
#r = r.reset_index()
# The return
ret = mean_returns.T*x
# The risk in xT.Q.x format
risk = quad_form(x, cov_matrix)
# The core problem definition with the Problem class from CVXPY
prob = Problem(Minimize(risk), [sum(x)==1, x >= 0])
try:
    prob.solve()
    print("Optimal portfolio")
    print("----------------------")
    for s in range(len(symbols)):
        print(" Investment in {} : {}% of the portfolio".format(symbols[s], round(100*x.value[s], 2)))
    print("----------------------")
    print("Exp ret = {}%".format(round(100*ret.value, 2)))
    print("Expected risk = {}%".format(round(100*risk.value**0.5, 2)))
except:
    print("Error")
Also, I came across a similar technique to solve the same problem.
import pandas as pd
import numpy as np
import scipy.optimize as sco
from cvxpy import *
data = [{'Month': '2020-01-01', 'Revenue':5000, 'Building':'Stadium'},
        {'Month': '2020-02-01', 'Revenue':5500, 'Building':'Stadium'},
        {'Month': '2020-03-01', 'Revenue':6000, 'Building':'Stadium'},
        {'Month': '2020-04-01', 'Revenue':6600, 'Building':'Stadium'},
        {'Month': '2020-01-01', 'Revenue':10000, 'Building':'Casino'},
        {'Month': '2020-02-01', 'Revenue':11000, 'Building':'Casino'},
        {'Month': '2020-03-01', 'Revenue':12000, 'Building':'Casino'},
        {'Month': '2020-04-01', 'Revenue':13000, 'Building':'Casino'}]
df = pd.DataFrame(data)
df
thelen = 2
table = df.pivot(index='Month', columns='Building', values='Revenue')
table.head()
returns = table.pct_change()
mean_returns = returns.mean()
cov_matrix = returns.cov()
num_portfolios = 10000
risk_free_rate = 0.0178
def portfolio_annualised_performance(weights, mean_returns, cov_matrix):
    returns = np.sum(mean_returns*weights) * 252
    std = np.sqrt(np.dot(weights.T, np.dot(cov_matrix, weights))) * np.sqrt(252)
    return std, returns

def random_portfolios(num_portfolios, mean_returns, cov_matrix, risk_free_rate):
    results = np.zeros((3, num_portfolios))
    weights_record = []
    for i in range(num_portfolios):
        weights = np.random.random(thelen)
        weights /= np.sum(weights)
        weights_record.append(weights)
        portfolio_std_dev, portfolio_return = portfolio_annualised_performance(weights, mean_returns, cov_matrix)
        results[0, i] = portfolio_std_dev
        results[1, i] = portfolio_return
        results[2, i] = (portfolio_return - risk_free_rate) / portfolio_std_dev
    return results, weights_record

def neg_sharpe_ratio(weights, mean_returns, cov_matrix, risk_free_rate):
    p_var, p_ret = portfolio_annualised_performance(weights, mean_returns, cov_matrix)
    return -(p_ret - risk_free_rate) / p_var

def max_sharpe_ratio(mean_returns, cov_matrix, risk_free_rate):
    num_assets = len(mean_returns)
    args = (mean_returns, cov_matrix, risk_free_rate)
    constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
    bound = (0.0, 1.0)
    bounds = tuple(bound for asset in range(num_assets))
    result = sco.minimize(neg_sharpe_ratio, num_assets*[1./num_assets,], args=args,
                          method='SLSQP', bounds=bounds, constraints=constraints)
    return result

def portfolio_volatility(weights, mean_returns, cov_matrix):
    return portfolio_annualised_performance(weights, mean_returns, cov_matrix)[0]

def min_variance(mean_returns, cov_matrix):
    num_assets = len(mean_returns)
    args = (mean_returns, cov_matrix)
    constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
    bound = (0.0, 1.0)
    bounds = tuple(bound for asset in range(num_assets))
    result = sco.minimize(portfolio_volatility, num_assets*[1./num_assets,], args=args,
                          method='SLSQP', bounds=bounds, constraints=constraints)
    return result

def efficient_return(mean_returns, cov_matrix, target):
    num_assets = len(mean_returns)
    args = (mean_returns, cov_matrix)
    def portfolio_return(weights):
        return portfolio_annualised_performance(weights, mean_returns, cov_matrix)[1]
    constraints = ({'type': 'eq', 'fun': lambda x: portfolio_return(x) - target},
                   {'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
    bounds = tuple((0, 1) for asset in range(num_assets))
    result = sco.minimize(portfolio_volatility, num_assets*[1./num_assets,], args=args, method='SLSQP', bounds=bounds, constraints=constraints)
    return result

def efficient_frontier(mean_returns, cov_matrix, returns_range):
    efficients = []
    for ret in returns_range:
        efficients.append(efficient_return(mean_returns, cov_matrix, ret))
    return efficients
max_sharpe = max_sharpe_ratio(mean_returns, cov_matrix, risk_free_rate)
sdp, rp = portfolio_annualised_performance(max_sharpe['x'], mean_returns, cov_matrix)
max_sharpe_allocation = pd.DataFrame(max_sharpe.x,index=table.columns,columns=['allocation'])
max_sharpe_allocation.allocation = [round(i*100,2)for i in max_sharpe_allocation.allocation]
max_sharpe_allocation = max_sharpe_allocation.T
min_vol = min_variance(mean_returns, cov_matrix)
sdp_min, rp_min = portfolio_annualised_performance(min_vol['x'], mean_returns, cov_matrix)
min_vol_allocation = pd.DataFrame(min_vol.x,index=table.columns,columns=['allocation'])
min_vol_allocation.allocation = [round(i*100,2)for i in min_vol_allocation.allocation]
min_vol_allocation = min_vol_allocation.T
an_vol = np.std(returns) * np.sqrt(252)
an_rt = mean_returns * 252
for i, txt in enumerate(table.columns):
    print(txt, ":", "annualised return", round(an_rt[i], 2), ", annualised volatility:", round(an_vol[i], 2))
print("-"*80)
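In case someone comes across this: the dimension error most likely comes from how C is built in the first snippet. r holds one mean return per building, and np.cov on that 1-D, length-2 vector returns a scalar, so C ends up 1x1 while x has length 2; quad_form(x, C) needs an n x n matrix for a length-n x. A minimal sketch of a correctly shaped covariance, built (as the later working snippet does) from the per-building return series rather than from their means:

import numpy as np

table = df.pivot_table(index='Month', columns='Building', values='Revenue')
returns = table.pct_change().dropna()

C = returns.cov().values   # shape (n, n): one row/column per building
r = returns.mean().values  # length n: one mean return per building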

Heiken Ashi candles plotted on graph Binance

So I'm trying to compute Heiken Ashi candles and then plot them on a graph.
My code so far:
def heikin_ashi():
    historical_data = client.get_historical_klines(symbol=SYMBOL, interval=TIME_PERIOD, start_str="15 days ago UTC", klines_type=HistoricalKlinesType.FUTURES)
    hist_df = pd.DataFrame(historical_data)
    hist_df.columns = ['Open Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close Time', 'Quote Asset Volume',
                       'Number of Trades', 'TB Base Volume', 'TB Quote Volume', 'Ignore']
    hist_df['Open Time'] = pd.to_datetime(hist_df['Open Time']/1000, unit='s')
    hist_df['Close Time'] = pd.to_datetime(hist_df['Close Time']/1000, unit='s')
    df_HA = hist_df
    df_HA['Close'] = (hist_df['Open'] + hist_df['High'] + hist_df['Low'] + hist_df['Close']) / 4
    # idx = df_HA.index.name
    # df_HA.reset_index(inplace=True)
    for i in range(0, len(hist_df)):
        if i == 0:
            df_HA['Open'][i] = ((hist_df['Open'][i] + hist_df['Close'][i]) / 2)
        else:
            df_HA['Open'][i] = ((hist_df['Open'][i - 1] + hist_df['Close'][i - 1]) / 2)
    # if idx:
    #     df_HA.set_index(idx, inplace=True)
    df_HA['High'] = hist_df[['Open', 'Close', 'High']].max(axis=1)
    df_HA['Low'] = hist_df[['Open', 'Close', 'Low']].min(axis=1)
    print(df_HA)
Error:
result[mask] = op(xrav[mask], y)
TypeError: unsupported operand type(s) for /: 'str' and 'int'
Also I came across this:
import pandas as pd

def heikin_ashi(df):
    heikin_ashi_df = pd.DataFrame(index=df.index.values, columns=['open', 'high', 'low', 'close'])
    heikin_ashi_df['close'] = (df['open'] + df['high'] + df['low'] + df['close']) / 4
    for i in range(len(df)):
        if i == 0:
            heikin_ashi_df.iat[0, 0] = df['open'].iloc[0]
        else:
            heikin_ashi_df.iat[i, 0] = (heikin_ashi_df.iat[i-1, 0] + heikin_ashi_df.iat[i-1, 3]) / 2
    heikin_ashi_df['high'] = heikin_ashi_df.loc[:, ['open', 'close']].join(df['high']).max(axis=1)
    heikin_ashi_df['low'] = heikin_ashi_df.loc[:, ['open', 'close']].join(df['low']).min(axis=1)
    return heikin_ashi_df
How do I use the above code with my data? I'm a novice, so I'm confused. I'd appreciate it if someone could provide me with a proper way to do this.
Link to the source: https://github.com/emreturan/heikin-ashi/blob/master/heikin_ashi.py
I need to plot this on a graph too. Thanks.
I will answer using the heikin_ashi code above together with mplfinance, a popular finance library, for the graph. There are many other libraries available for visualizing investments, so take this as a basic form of data acquisition and visualization. A sample of mplfinance can be found here for reference.
import yfinance as yf
import pandas as pd
import mplfinance as mpf

data = yf.download("AAPL", start="2021-07-01", end="2022-01-01", progress=False)
data.columns = ['open', 'high', 'low', 'close', 'adj close', 'volume']

def heikin_ashi(df):
    heikin_ashi_df = df.copy()
    #heikin_ashi_df = pd.DataFrame(index=df.index.values, columns=['open', 'high', 'low', 'close'])
    heikin_ashi_df['close'] = (df['open'] + df['high'] + df['low'] + df['close']) / 4
    for i in range(len(df)):
        if i == 0:
            heikin_ashi_df.iat[0, 0] = df['open'].iloc[0]
        else:
            heikin_ashi_df.iat[i, 0] = (heikin_ashi_df.iat[i-1, 0] + heikin_ashi_df.iat[i-1, 3]) / 2
    heikin_ashi_df['high'] = heikin_ashi_df.loc[:, ['open', 'close']].join(df['high']).max(axis=1)
    heikin_ashi_df['low'] = heikin_ashi_df.loc[:, ['open', 'close']].join(df['low']).min(axis=1)
    return heikin_ashi_df

df_ha = heikin_ashi(data)
# mpf plotting
mpf.plot(df_ha, type='candle', figratio=(8, 4), title='AAPL', style='yahoo')
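As a side note on the TypeError in the first snippet: python-binance returns the kline fields as strings, so the OHLC columns need a numeric cast before any arithmetic. A minimal sketch, assuming the column names from the question:

# Cast the string kline fields to numbers before computing Heiken Ashi.
num_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
hist_df[num_cols] = hist_df[num_cols].astype(float)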

KeyError: color='variable' using plot

I am trying to run the following script and I am getting a KeyError from the function that plots stock returns.
It seems to be coming from fig = px.line(grouped_metrics, x="Date Snapshot", y="value", color='variable'). However, 'variable' is a valid column in my df. I have tried passing different columns to the color= argument but I get the same error. My three columns are 'variable', 'value', and 'Date Snapshot'. Apologies for the block of code:
Data for ref
import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import yfinance as yf
import pandas_market_calendars as mcal
from plotly.offline import init_notebook_mode, plot
init_notebook_mode(connected=True)
def create_market_cal(start, end):
    nyse = mcal.get_calendar('NYSE')
    schedule = nyse.schedule(stocks_start, stocks_end)
    market_cal = mcal.date_range(schedule, frequency='1D')
    market_cal = market_cal.tz_localize(None)
    market_cal = [i.replace(hour=0) for i in market_cal]
    return market_cal

def get_data(stocks, start, end):
    def data(ticker):
        df = yf.download(ticker, start=start, end=(end + datetime.timedelta(days=1)))
        df['symbol'] = ticker
        df.index = pd.to_datetime(df.index)
        return df
    datas = map(data, stocks)
    return(pd.concat(datas, keys=stocks, names=['Ticker', 'Date'], sort=True))

def get_benchmark(benchmark, start, end):
    benchmark = get_data(benchmark, start, end)
    benchmark = benchmark.drop(['symbol'], axis=1)
    benchmark.reset_index(inplace=True)
    return benchmark
portfolio_df = pd.read_csv('C:\\tmp\\stock_transactions.csv')
portfolio_df['Open date'] = pd.to_datetime(portfolio_df['Open date'])
symbols = portfolio_df.Symbol.unique()
stocks_start = datetime.datetime(2018, 3, 1)
stocks_end = datetime.datetime(2021, 3, 10)
daily_adj_close = get_data(symbols, stocks_start, stocks_end)
daily_adj_close = daily_adj_close[['Close']].reset_index()
daily_benchmark = get_benchmark(['SPY'], stocks_start, stocks_end)
daily_benchmark = daily_benchmark[['Date', 'Close']]
market_cal = create_market_cal(stocks_start, stocks_end)
def position_adjust(daily_positions, sale):
    stocks_with_sales = pd.DataFrame()
    buys_before_start = daily_positions[daily_positions['Type'] == 'Buy'].sort_values(by='Open date')
    for position in buys_before_start[buys_before_start['Symbol'] == sale[1]['Symbol']].iterrows():
        if position[1]['Qty'] <= sale[1]['Qty']:
            sale[1]['Qty'] -= position[1]['Qty']
            position[1]['Qty'] = 0
        else:
            position[1]['Qty'] -= sale[1]['Qty']
            sale[1]['Qty'] -= sale[1]['Qty']
        stocks_with_sales = stocks_with_sales.append(position[1])
    return stocks_with_sales

def portfolio_start_balance(portfolio, start_date):
    positions_before_start = portfolio[portfolio['Open date'] <= start_date]
    future_sales = portfolio[(portfolio['Open date'] >= start_date) & (portfolio['Type'] == 'Sell')]
    sales = positions_before_start[positions_before_start['Type'] == 'Sell'].groupby(['Symbol'])['Qty'].sum()
    sales = sales.reset_index()
    positions_no_change = positions_before_start[~positions_before_start['Symbol'].isin(sales['Symbol'].unique())]
    adj_positions_df = pd.DataFrame()
    for sale in sales.iterrows():
        adj_positions = position_adjust(positions_before_start, sale)
        adj_positions_df = adj_positions_df.append(adj_positions)
    adj_positions_df = adj_positions_df.append(positions_no_change)
    adj_positions_df = adj_positions_df.append(future_sales)
    adj_positions_df = adj_positions_df[adj_positions_df['Qty'] > 0]
    return adj_positions_df
active_portfolio = portfolio_start_balance(portfolio_df, stocks_start)
def fifo(daily_positions, sales, date):
    sales = sales[sales['Open date'] == date]
    daily_positions = daily_positions[daily_positions['Open date'] <= date]
    positions_no_change = daily_positions[~daily_positions['Symbol'].isin(sales['Symbol'].unique())]
    adj_positions = pd.DataFrame()
    for sale in sales.iterrows():
        adj_positions = adj_positions.append(position_adjust(daily_positions, sale))
    adj_positions = adj_positions.append(positions_no_change)
    adj_positions = adj_positions[adj_positions['Qty'] > 0]
    return adj_positions

def time_fill(portfolio, market_cal):
    sales = portfolio[portfolio['Type'] == 'Sell'].groupby(['Symbol', 'Open date'])['Qty'].sum()
    sales = sales.reset_index()
    per_day_balance = []
    for date in market_cal:
        if (sales['Open date'] == date).any():
            portfolio = fifo(portfolio, sales, date)
        daily_positions = portfolio[portfolio['Open date'] <= date]
        daily_positions = daily_positions[daily_positions['Type'] == 'Buy']
        daily_positions['Date Snapshot'] = date
        per_day_balance.append(daily_positions)
    return per_day_balance
positions_per_day = time_fill(active_portfolio, market_cal)
def modified_cost_per_share(portfolio, adj_close, start_date):
    df = pd.merge(portfolio, adj_close, left_on=['Date Snapshot', 'Symbol'],
                  right_on=['Date', 'Ticker'], how='left')
    df.rename(columns={'Close': 'Symbol Adj Close'}, inplace=True)
    df['Adj cost daily'] = df['Symbol Adj Close'] * df['Qty']
    df = df.drop(['Ticker', 'Date'], axis=1)
    return df

def benchmark_portfolio_calcs(portfolio, benchmark):
    portfolio = pd.merge(portfolio, benchmark, left_on=['Date Snapshot'],
                         right_on=['Date'], how='left')
    portfolio = portfolio.drop(['Date'], axis=1)
    portfolio.rename(columns={'Close': 'Benchmark Close'}, inplace=True)
    benchmark_max = benchmark[benchmark['Date'] == benchmark['Date'].max()]
    portfolio['Benchmark End Date Close'] = portfolio.apply(lambda x: benchmark_max['Close'], axis=1)
    benchmark_min = benchmark[benchmark['Date'] == benchmark['Date'].min()]
    portfolio['Benchmark Start Date Close'] = portfolio.apply(lambda x: benchmark_min['Close'], axis=1)
    return portfolio

def portfolio_end_of_year_stats(portfolio, adj_close_end):
    adj_close_end = adj_close_end[adj_close_end['Date'] == adj_close_end['Date'].max()]
    portfolio_end_data = pd.merge(portfolio, adj_close_end, left_on='Symbol', right_on='Ticker')
    portfolio_end_data.rename(columns={'Close': 'Ticker End Date Close'}, inplace=True)
    portfolio_end_data = portfolio_end_data.drop(['Ticker', 'Date'], axis=1)
    return portfolio_end_data

def portfolio_start_of_year_stats(portfolio, adj_close_start):
    adj_close_start = adj_close_start[adj_close_start['Date'] == adj_close_start['Date'].min()]
    portfolio_start = pd.merge(portfolio, adj_close_start[['Ticker', 'Close', 'Date']],
                               left_on='Symbol', right_on='Ticker')
    portfolio_start.rename(columns={'Close': 'Ticker Start Date Close'}, inplace=True)
    portfolio_start['Adj cost per share'] = np.where(portfolio_start['Open date'] <= portfolio_start['Date'],
                                                     portfolio_start['Ticker Start Date Close'],
                                                     portfolio_start['Adj cost per share'])
    portfolio_start['Adj cost'] = portfolio_start['Adj cost per share'] * portfolio_start['Qty']
    portfolio_start = portfolio_start.drop(['Ticker', 'Date'], axis=1)
    portfolio_start['Equiv Benchmark Shares'] = portfolio_start['Adj cost'] / portfolio_start['Benchmark Start Date Close']
    portfolio_start['Benchmark Start Date Cost'] = portfolio_start['Equiv Benchmark Shares'] * portfolio_start['Benchmark Start Date Close']
    return portfolio_start

def calc_returns(portfolio):
    portfolio['Benchmark Return'] = portfolio['Benchmark Close'] / portfolio['Benchmark Start Date Close'] - 1
    portfolio['Ticker Return'] = portfolio['Symbol Adj Close'] / portfolio['Adj cost per share'] - 1
    portfolio['Ticker Share Value'] = portfolio['Qty'] * portfolio['Symbol Adj Close']
    portfolio['Benchmark Share Value'] = portfolio['Equiv Benchmark Shares'] * portfolio['Benchmark Close']
    portfolio['Abs Value Compare'] = portfolio['Ticker Share Value'] - portfolio['Benchmark Start Date Cost']
    portfolio['Abs Value Return'] = portfolio['Abs Value Compare'] / portfolio['Benchmark Start Date Cost']
    portfolio['Stock Gain / (Loss)'] = portfolio['Ticker Share Value'] - portfolio['Adj cost']
    portfolio['Benchmark Gain / (Loss)'] = portfolio['Benchmark Share Value'] - portfolio['Adj cost']
    portfolio['Abs. Return Compare'] = portfolio['Ticker Return'] - portfolio['Benchmark Return']
    return portfolio

def per_day_portfolio_calcs(per_day_holdings, daily_benchmark, daily_adj_close, stocks_start):
    df = pd.concat(per_day_holdings, sort=True)
    mcps = modified_cost_per_share(df, daily_adj_close, stocks_start)
    bpc = benchmark_portfolio_calcs(mcps, daily_benchmark)
    pes = portfolio_end_of_year_stats(bpc, daily_adj_close)
    pss = portfolio_start_of_year_stats(pes, daily_adj_close)
    returns = calc_returns(pss)
    return returns
combined_df = per_day_portfolio_calcs(positions_per_day, daily_benchmark, daily_adj_close, stocks_start)
def line(df, val_1, val_2):
    grouped_metrics = combined_df.groupby(['Date Snapshot'])[[val_1, val_2]].sum().reset_index()
    grouped_metrics = pd.melt(grouped_metrics, id_vars=['Date Snapshot'],
                              value_vars=[val_1, val_2])
    fig = px.line(grouped_metrics, x="Date Snapshot", y="value", color='variable')
    plot(fig)

line(combined_df, 'Stock Gain / (Loss)', 'Benchmark Gain / (Loss)')

def line_facets(df, val_1, val_2):
    grouped_metrics = combined_df.groupby(['Symbol', 'Date Snapshot'])[[val_1, val_2]].sum().reset_index()
    grouped_metrics = pd.melt(grouped_metrics, id_vars=['Symbol', 'Date Snapshot'],
                              value_vars=[val_1, val_2])
    fig = px.line(grouped_metrics, x="Date Snapshot", y="value",
                  color='variable', facet_col="Symbol", facet_col_wrap=5)
    plot(fig)

line_facets(combined_df, 'Ticker Return', 'Benchmark Return')
The above throws the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-4-337cc930bd36> in <module>
183
184
--> 185 line(combined_df, 'Stock Gain / (Loss)', 'Benchmark Gain / (Loss)')
186
187
<ipython-input-4-337cc930bd36> in line(df, val_1, val_2)
179 value_vars=[val_1, val_2])
180 fig = px.line(grouped_metrics, x="Date Snapshot", y="value",
--> 181 color='variable')
182 plot(fig)
183
~\anaconda3\lib\site-packages\plotly\express\_chart_types.py in line(data_frame, x, y, line_group, color, line_dash, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, line_dash_sequence, line_dash_map, log_x, log_y, range_x, range_y, line_shape, render_mode, title, template, width, height)
250 a polyline mark in 2D space.
251 """
--> 252 return make_figure(args=locals(), constructor=go.Scatter)
253
254
~\anaconda3\lib\site-packages\plotly\express\_core.py in make_figure(args, constructor, trace_patch, layout_patch)
1887 prefix = get_label(args, args["facet_row"]) + "="
1888 row_labels = [prefix + str(s) for s in sorted_group_values[m.grouper]]
-> 1889 for val in sorted_group_values[m.grouper]:
1890 if val not in m.val_map:
1891 m.val_map[val] = m.sequence[len(m.val_map) % len(m.sequence)]
KeyError: 'variable'
In case someone comes across this issue: I had the same situation, and in my case the error message was misleading. The root cause was that the dataframe passed to px.line() was empty (no rows).
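A quick way to confirm that diagnosis before plotting is an explicit emptiness check; a minimal sketch, assuming grouped_metrics as built in line() above:

# Fail loudly instead of letting px.line raise a misleading KeyError.
if grouped_metrics.empty:
    raise ValueError("grouped_metrics has no rows; check the upstream filters and merges")
fig = px.line(grouped_metrics, x="Date Snapshot", y="value", color='variable')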

Saving a dataframe to CSV in Python 2

I am attempting to save a dataframe to CSV. When I print the dataframe it produces the output I'm looking for, but when I save the dataframe to CSV I only get the last line of the dataframe in the CSV file. This is what I have attempted so far:
Index_tickers = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\Index_tickers.csv')
Ticker = Index_tickers.ticker
for ticker in Index_tickers.ticker:
    index_data = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\' + ticker + '_1.csv')
    mkt_data = index_data[['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']]
    numRow = 2
    while numRow < endRow:
        dOpen0 = mkt_data.ix[numRow, 'Open']
        dHigh0 = mkt_data.ix[numRow, 'High']
        dLow0 = mkt_data.ix[numRow, 'Low']
        dClose0 = mkt_data.ix[numRow, 'Close']
        dDate0 = mkt_data.ix[numRow, 'Date']
        dTime0 = mkt_data.ix[numRow, 'Time']
        dTicker0 = index_data.ix[numRow, 'Ticker']
        dHaClose0 = (dOpen0 + dHigh0 + dLow0 + dClose0) / 4
        dClose1 = mkt_data.ix[numRow - 2, 'Close']
        dOpen1 = mkt_data.ix[numRow - 2, 'Open']
        dHaOpen0 = (dClose1 + dOpen1) / 2
        dHaHigh0 = max(dHigh0, dHaOpen0, dHaClose0)
        dHaLow0 = min(dLow0, dHaOpen0, dHaClose0)
        dHaGreen0 = dHaClose0 > dHaOpen0
        dHaRed0 = dHaClose0 < dHaOpen0
        dNumRow = numRow
        numRow = numRow + 1
        df = pd.DataFrame({'numRow': pd.Series(dNumRow), 'Time': pd.Series(dTime0), 'Date': pd.Series(dDate0), 'Ticker': pd.Series(dTicker0), 'Open0': pd.Series(dOpen0), 'High0': pd.Series(dHigh0), 'Low0': pd.Series(dLow0), 'Close0': pd.Series(dClose0)})
        #print df
        df.to_csv('C:\\Users\\ME\\Dropbox\\MktData\\HaDetail.csv')
Any help hugely appreciated. I'm new to Python and learning on the job.
You are overwriting your csv on each iteration because the default mode is 'w', which overwrites the file if it exists. Additionally, you are writing out your header every time, and you only need to do this on the first iteration, so I would do the following:
Index_tickers = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\Index_tickers.csv')
Ticker = Index_tickers.ticker
writeHeader = True
for ticker in Index_tickers.ticker:
    index_data = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\' + ticker + '_1.csv')
    mkt_data = index_data[['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']]
    numRow = 2
    while numRow < endRow:
        dOpen0 = mkt_data.ix[numRow, 'Open']
        dHigh0 = mkt_data.ix[numRow, 'High']
        dLow0 = mkt_data.ix[numRow, 'Low']
        dClose0 = mkt_data.ix[numRow, 'Close']
        dDate0 = mkt_data.ix[numRow, 'Date']
        dTime0 = mkt_data.ix[numRow, 'Time']
        dTicker0 = index_data.ix[numRow, 'Ticker']
        dHaClose0 = (dOpen0 + dHigh0 + dLow0 + dClose0) / 4
        dClose1 = mkt_data.ix[numRow - 2, 'Close']
        dOpen1 = mkt_data.ix[numRow - 2, 'Open']
        dHaOpen0 = (dClose1 + dOpen1) / 2
        dHaHigh0 = max(dHigh0, dHaOpen0, dHaClose0)
        dHaLow0 = min(dLow0, dHaOpen0, dHaClose0)
        dHaGreen0 = dHaClose0 > dHaOpen0
        dHaRed0 = dHaClose0 < dHaOpen0
        dNumRow = numRow
        numRow = numRow + 1
        df = pd.DataFrame({'numRow': pd.Series(dNumRow), 'Time': pd.Series(dTime0), 'Date': pd.Series(dDate0), 'Ticker': pd.Series(dTicker0), 'Open0': pd.Series(dOpen0), 'High0': pd.Series(dHigh0), 'Low0': pd.Series(dLow0), 'Close0': pd.Series(dClose0)})
        #print df
        if writeHeader:
            df.to_csv('C:\\Users\\ME\\Dropbox\\MktData\\HaDetail.csv')
            writeHeader = False
        else:
            df.to_csv('C:\\Users\\ME\\Dropbox\\MktData\\HaDetail.csv', header=False, mode='a')
So we only write the header on the first iteration, and for each subsequent iteration we change the mode to 'a' so it appends to the file; see the docs.
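An alternative that avoids reopening the file on every row is to accumulate the rows in a list and write the file once at the end; a minimal sketch, assuming the same loop variables as above:

rows = []
# ... inside the while loop, in place of df.to_csv(...):
rows.append({'numRow': dNumRow, 'Time': dTime0, 'Date': dDate0, 'Ticker': dTicker0,
             'Open0': dOpen0, 'High0': dHigh0, 'Low0': dLow0, 'Close0': dClose0})
# ... after both loops finish:
pd.DataFrame(rows).to_csv('C:\\Users\\ME\\Dropbox\\MktData\\HaDetail.csv', index=False)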
