Python Excel Calculations

I am not getting correct calculations for three columns I am trying to write to a sheet for specific dates and times in time-series data. I want to calculate the difference between the close price at various times and the close price at the final time of day, but for some reason I can't get correct output from the calculations.
Here is the code:
import pandas as pd
import os
import numpy as np
from openpyxl import Workbook
# Read the data into a Pandas DataFrame
directory_path = "C:/Users/bean/Desktop/_L"
os.chdir(directory_path)
book = Workbook()
book.remove(book.active) # remove the first sheet
for file in os.listdir(directory_path):
    if file.endswith(".csv"):
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path)
        # Create a new DataFrame for each file
        df_diff = df[['Date', 'CET', 'NA', 'UTC', 'Name', 'BOLLBU', 'BOLLBM', 'BOLLBL',
                      'VWAP', 'VWAPSD1U', 'VWAPSD1L', 'VWAPSD2U', 'VWAPSD2L', 'ATR', 'ATRMA']]
        df['Date'] = pd.to_datetime(df['Date'])
        df['CET'] = pd.to_datetime(df['Date'])
        df['UTC'] = pd.to_datetime(df['Date'])
        df['NA'] = pd.to_datetime(df['Date'])
        df_diff['Date'] = pd.to_datetime(df['Date'])
        df_diff['CET'] = pd.to_datetime(df['CET'])
        df_diff['UTC'] = pd.to_datetime(df['UTC'])
        df_diff['NA'] = pd.to_datetime(df['NA'])
        df_diff['Open'] = df['Open']
        df_diff['High'] = df['High']
        df_diff['Low'] = df['Low']
        df_diff['Close'] = df['Close']
        # Calculate the differences and add them as new columns
        df_diff['Open Diff'] = (df['Open'].shift(-1) - df['Open']) / df['Open'] * 100
        df_diff['High Diff'] = (df['High'].shift(-1) - df['High']) / df['High'] * 100
        df_diff['Low Diff'] = (df['Low'].shift(-1) - df['Low']) / df['Low'] * 100
        df_diff['Close Diff'] = (df['Close'].shift(-1) - df['Close']) / df['Close'] * 100
        df_1635 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 35)].sort_values(by='Date', ascending=False)
        df_1625 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 25)].sort_values(by='Date', ascending=False)
        df_1620 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 20)].sort_values(by='Date', ascending=False)
        df_1615 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 15)].sort_values(by='Date', ascending=False)
        df_1610 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 10)].sort_values(by='Date', ascending=False)
        df_1605 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 5)].sort_values(by='Date', ascending=False)
        df_1600 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1545 = df[(df['Date'].dt.hour == 15) & (df['Date'].dt.minute == 45)].sort_values(by='Date', ascending=False)
        df_1530 = df[(df['Date'].dt.hour == 15) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_1500 = df[(df['Date'].dt.hour == 15) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1445 = df[(df['Date'].dt.hour == 14) & (df['Date'].dt.minute == 45)].sort_values(by='Date', ascending=False)
        df_1430 = df[(df['Date'].dt.hour == 14) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_1400 = df[(df['Date'].dt.hour == 14) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1330 = df[(df['Date'].dt.hour == 13) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_1300 = df[(df['Date'].dt.hour == 13) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        df_1230 = df[(df['Date'].dt.hour == 12) & (df['Date'].dt.minute == 30)].sort_values(by='Date', ascending=False)
        df_0800 = df[(df['Date'].dt.hour == 8) & (df['Date'].dt.minute == 0)].sort_values(by='Date', ascending=False)
        # Calculate difference between Close price of df_1635 and other DataFrames
        df_diff_1635_1625 = df_1635['Close'] - df_1625['Close']
        df_diff_1635_1620 = df_1635['Close'].subtract(df_1620['Close'])
        df_diff_1635_1615 = df_1635['Close'].subtract(df_1615['Close'])
        df_diff_1635_1610 = df_1635['Close'].subtract(df_1610['Close'])
        df_diff_1635_1605 = df_1635['Close'].subtract(df_1605['Close'])
        df_diff_1635_1600 = df_1635['Close'].subtract(df_1600['Close'])
        df_diff_1635_1545 = df_1635['Close'].subtract(df_1545['Close'])
        df_diff_1635_1530 = df_1635['Close'].subtract(df_1530['Close'])
        df_diff_1635_1500 = df_1635['Close'].subtract(df_1500['Close'])
        df_diff_1635_1445 = df_1635['Close'].subtract(df_1445['Close'])
        df_diff_1635_1430 = df_1635['Close'].subtract(df_1430['Close'])
        df_diff_1635_1400 = df_1635['Close'].subtract(df_1400['Close'])
        df_diff_1635_1330 = df_1635['Close'].subtract(df_1330['Close'])
        df_diff_1635_1300 = df_1635['Close'].subtract(df_1300['Close'])
        df_diff_1635_1230 = df_1635['Close'].subtract(df_1230['Close'])
        df_diff_1635_0800 = df_1635['Close'].subtract(df_0800['Close'])
        print(df_diff_1635_1625)
        # Add Difference, Percent_Diff, and U/D columns to each DataFrame
        df_1635['Difference'] = df_1635['Close'].subtract(df_1635['Close'].shift())
        df_1635['Percent_Diff'] = (df_1635['Difference'] / df_1635['Close']) * 100
        df_1635['U/D'] = np.where(df_1635['Difference'] > 0, 'U', 'D')
        df_1625['Difference'] = df_diff_1635_1625
        df_1625['Percent_Diff'] = (df_diff_1635_1625 / df_1635['Close']) * 100
        df_1625['U/D'] = np.where(df_1625['Percent_Diff'] > 0, 'U', 'D')
        print(df_1625.dtypes)
        df_1620['Difference'] = df_diff_1635_1620
        df_1620['Percent_Diff'] = (df_diff_1635_1620 / df_1635['Close']) * 100
        df_1620['U/D'] = np.where(df_1620['Percent_Diff'] > 0, 'U', 'D')
        df_1615['Difference'] = df_diff_1635_1615
        df_1615['Percent_Diff'] = (df_diff_1635_1615 / df_1635['Close']) * 100
        df_1615['U/D'] = np.where(df_1615['Percent_Diff'] > 0, 'U', 'D')
        df_1610['Difference'] = df_diff_1635_1610
        df_1610['Percent_Diff'] = (df_diff_1635_1610 / df_1635['Close']) * 100
        df_1610['U/D'] = np.where(df_1610['Percent_Diff'] > 0, 'U', 'D')
        df_1605['Difference'] = df_diff_1635_1605
        df_1605['Percent_Diff'] = (df_diff_1635_1605 / df_1635['Close']) * 100
        df_1605['U/D'] = np.where(df_1605['Percent_Diff'] > 0, 'U', 'D')
        df_1600['Difference'] = df_diff_1635_1600
        df_1600['Percent_Diff'] = (df_diff_1635_1600 / df_1635['Close']) * 100
        df_1600['U/D'] = np.where(df_1600['Percent_Diff'] > 0, 'U', 'D')
        df_1545['Difference'] = df_diff_1635_1545
        df_1545['Percent_Diff'] = (df_diff_1635_1545 / df_1635['Close']) * 100
        df_1545['U/D'] = np.where(df_1545['Percent_Diff'] > 0, 'U', 'D')
        df_1530['Percent_Diff'] = (df_diff_1635_1530 / df_1635['Close']) * 100
        df_1530['U/D'] = np.where(df_1530['Percent_Diff'] > 0, 'U', 'D')
        df_1500['Difference'] = df_diff_1635_1500
        df_1500['Percent_Diff'] = (df_diff_1635_1500 / df_1635['Close']) * 100
        df_1500['U/D'] = np.where(df_1500['Percent_Diff'] > 0, 'U', 'D')
        df_1445['Difference'] = df_diff_1635_1445
        df_1445['Percent_Diff'] = (df_diff_1635_1445 / df_1635['Close']) * 100
        df_1445['U/D'] = np.where(df_1445['Percent_Diff'] > 0, 'U', 'D')
        df_1430['Difference'] = df_diff_1635_1430
        df_1430['Percent_Diff'] = (df_diff_1635_1430 / df_1635['Close']) * 100
        df_1430['U/D'] = np.where(df_1430['Percent_Diff'] > 0, 'U', 'D')
        df_1400['Difference'] = df_diff_1635_1400
        df_1400['Percent_Diff'] = (df_diff_1635_1400 / df_1635['Close']) * 100
        df_1400['U/D'] = np.where(df_1400['Percent_Diff'] > 0, 'U', 'D')
        df_1330['Difference'] = df_diff_1635_1330
        df_1330['Percent_Diff'] = (df_diff_1635_1330 / df_1635['Close']) * 100
        df_1330['U/D'] = np.where(df_1330['Percent_Diff'] > 0, 'U', 'D')
        df_1300['Difference'] = df_diff_1635_1300
        df_1300['Percent_Diff'] = (df_diff_1635_1300 / df_1635['Close']) * 100
        df_1300['U/D'] = np.where(df_1300['Percent_Diff'] > 0, 'U', 'D')
        df_1230['Difference'] = df_diff_1635_1230
        df_1230['Percent_Diff'] = (df_diff_1635_1230 / df_1635['Close']) * 100
        df_1230['U/D'] = np.where(df_1230['Percent_Diff'] > 0, 'U', 'D')
        df_0800['Difference'] = df_diff_1635_0800
        df_0800['Percent_Diff'] = (df_diff_1635_0800 / df_1635['Close']) * 100
        df_0800['U/D'] = np.where(df_0800['Percent_Diff'] > 0, 'U', 'D')
        df_25 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 25)].sort_values(by='Date', ascending=False)
        df_35 = df[(df['Date'].dt.hour == 16) & (df['Date'].dt.minute == 35)].sort_values(by='Date', ascending=False)
        # Concat all results for each time into this sheet.
        df_35 = df_35[['Date', 'CET', 'NA', 'UTC', 'Name', 'Open', 'Open Diff', 'High', 'High Diff', 'Low', 'Low Diff',
                       'Close', 'Close Diff', 'BOLLBU', 'BOLLBM', 'BOLLBL', 'VWAP', 'VWAPSD1U', 'VWAPSD1L', 'VWAPSD2U',
                       'VWAPSD2L', 'ATR', 'ATRMA']]
        df_diff = df_diff.sort_values(by='Date', ascending=False)
        df_diff = df_diff[['Date', 'CET', 'NA', 'UTC', 'Name', 'Open', 'Open Diff', 'High', 'High Diff', 'Low', 'Low Diff',
                           'Close', 'Close Diff', 'BOLLBU', 'BOLLBM', 'BOLLBL', 'VWAP', 'VWAPSD1U', 'VWAPSD1L', 'VWAPSD2U',
                           'VWAPSD2L', 'ATR', 'ATRMA']]
        writer = pd.ExcelWriter(f'{file.split(".")[0]}.xlsx', engine='openpyxl')
        df_diff.to_excel(writer, sheet_name='df_diff', index=False, startrow=0)
        df_35.to_excel(writer, sheet_name='Sheet_35min', index=False)
        dataframes = [df_1625, df_1620, df_1615, df_1610, df_1605, df_1600, df_1545,
                      df_1530, df_1500, df_1445, df_1430, df_1400, df_1330, df_1300, df_1230, df_0800]
        for i, df in enumerate(dataframes):
            df.to_excel(writer, sheet_name=f"df_{i}", index=False)
        writer.save()
Essentially, the calculations inside the for loop and for df_35 are not coming out correctly. How am I doing the operations wrong? The Date column is datetime and I am filtering on the specific time values, so I don't understand why it doesn't work. I tried various ways; here are a few calculation methods I tried that were still wrong.
Neither of these works:
df_diff_1635_1625 = df_1635['Close'] - df_1625['Close']
df_diff_1635_1620 = df_1635['Close'].subtract(df_1620['Close'])
All my columns are float64, including the Close columns, except Date, which is datetime. When I check and print the calculation I get NaN values, so it is clearly not processing it.
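For what it's worth, a likely explanation for the NaN values (an observation added here, not confirmed in the original post): pandas aligns Series on their index when subtracting, and df_1635 and df_1625 are slices of different rows of df, so their row labels never overlap and every element of the result becomes NaN. A minimal sketch of the behaviour and two common workarounds, using made-up prices:

import pandas as pd

# Pretend these are the Close prices filtered at 16:35 and 16:25;
# they keep the row labels they had in the original frame.
close_1635 = pd.Series([101.0, 103.0], index=[7, 19])
close_1625 = pd.Series([100.0, 102.0], index=[5, 17])

print(close_1635 - close_1625)   # all NaN: labels 5, 7, 17, 19 never match up

# Workaround 1: drop the labels and subtract by position
print(close_1635.reset_index(drop=True) - close_1625.reset_index(drop=True))

# Workaround 2: subtract the underlying NumPy array (also positional)
print(close_1635 - close_1625.values)

The same alignment rule applies when the result is assigned back, e.g. df_1625['Difference'] = df_diff_1635_1625, so the rows need a shared key (such as the trading date) or a positional assignment to line up.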

Related

Vectorized Solution to Iterrows

I have 2 dataframes : prediction_df and purchase_info_df. prediction_df contains customer id and prediction date. purchase_info_df contains customer id, purchase amount and purchase date. The dataframes are provided below for a single customer.
customer_id = [1, 1, 1]
prediction_date = ["2022-12-30", "2022-11-30", "2022-10-30"]
purchase_date = ["2022-11-12", "2022-12-01", "2022-09-03"]
purchase_amount = [500, 300, 100]
prediction_df = pd.DataFrame({"id":customer_id, "prediction_date":prediction_date})
purchase_info_df = pd.DataFrame({"id":customer_id,"purchase_date": purchase_date, "purchase_amount": purchase_amount})
prediction_df["prediction_date"] = pd.to_datetime(prediction_df["prediction_date"])
purchase_info_df["purchase_date"] = pd.to_datetime(purchase_info_df["purchase_date"])
My aim is to create features such as total purchase, mean purchase amount, purchase amount in the last month, etc. as of the prediction_date. I can do this with the following code, which uses iterrows, but it is way too slow when I have over 100,000 customers. I am looking for a way to vectorize the operations in the code below so that it runs faster.
res = []
for idx, rw in tqdm_notebook(prediction_df.iterrows(), total=prediction_df.shape[0]):
    dep_dat = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date)]
    dep_sum = dep_dat.purchase_amount.sum()
    dep_mean = dep_dat.purchase_amount.mean()
    dep_std = dep_dat.purchase_amount.std()
    dep_count = dep_dat.purchase_amount.count()
    last_15_days = rw.prediction_date - relativedelta(days=15)
    last_30_days = rw.prediction_date - relativedelta(days=30)
    last_45_days = rw.prediction_date - relativedelta(days=45)
    last_60_days = rw.prediction_date - relativedelta(days=60)
    last_15_days_dep_amount = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_15_days)].purchase_amount.sum()
    last_30_days_dep_amount = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_30_days)].purchase_amount.sum()
    last_45_days_dep_amount = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_45_days)].purchase_amount.sum()
    last_60_days_dep_amount = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_60_days)].purchase_amount.sum()
    last_15_days_dep_count = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_15_days)].purchase_amount.count()
    last_30_days_dep_count = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_30_days)].purchase_amount.count()
    last_45_days_dep_count = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_45_days)].purchase_amount.count()
    last_60_days_dep_count = purchase_info_df[(purchase_info_df.id == rw.id) & (purchase_info_df.purchase_date <= rw.prediction_date) & (purchase_info_df.purchase_date >= last_60_days)].purchase_amount.count()
    res.append([rw.id,
                rw.prediction_date,
                dep_sum,
                dep_mean,
                dep_count,
                last_15_days_dep_amount,
                last_30_days_dep_amount,
                last_45_days_dep_amount,
                last_60_days_dep_amount,
                last_15_days_dep_count,
                last_30_days_dep_count,
                last_45_days_dep_count,
                last_60_days_dep_count])
output = pd.DataFrame(res, columns=["id",
                                    "prediction_date",
                                    "amount_sum",
                                    "amount_mean",
                                    "purchase_count",
                                    "last_15_days_dep_amount",
                                    "last_30_days_dep_amount",
                                    "last_45_days_dep_amount",
                                    "last_60_days_dep_amount",
                                    "last_15_days_dep_count",
                                    "last_30_days_dep_count",
                                    "last_45_days_dep_count",
                                    "last_60_days_dep_count"])
Try this:
# Merge prediction and purchase info for each customer, keeping only rows where
# purchase_date <= prediction_date.
# Depending on how big the two frames are, your computer may run out of memory.
df = (
    prediction_df.merge(purchase_info_df, on="id")
    .query("purchase_date <= prediction_date")
)
cols = ["id", "prediction_date"]

# For each customer on each prediction date, calculate some stats
stat0 = df.groupby(cols)["purchase_amount"].agg(["sum", "mean", "count"])

# Now calculate the stats within some time windows
stats = {}
for t in pd.to_timedelta([15, 30, 45, 60], unit="d"):
    stats[f"last_{t.days}_days"] = (
        df[df["purchase_date"] >= df["prediction_date"] - t]
        .groupby(cols)["purchase_amount"]
        .agg(["sum", "count"])
    )

# Combine the individual stats for the final result
result = (
    pd.concat([stat0, *stats.values()], keys=["all", *stats.keys()], axis=1)
    .fillna(0)
)
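A possible follow-up, not part of the original answer: result ends up with a two-level column index such as ("all", "sum") and ("last_15_days", "sum"), so if you want flat feature names like the iterrows version produced, one sketch is to join the levels and merge back onto prediction_df:

# Flatten ("last_15_days", "sum") -> "last_15_days_sum", etc.
result.columns = ["_".join(col) for col in result.columns]
features = result.reset_index()

# Attach the features to the original prediction rows; prediction dates with no
# prior purchases drop out of the groupby, so keep them with a left merge and fill 0.
prediction_features = (
    prediction_df.merge(features, on=["id", "prediction_date"], how="left")
    .fillna(0)
)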

KeyError: color='variable' using plot

I am trying to run the following script. I am getting a KeyError on the function trying to plot stock returns.
It seems to be coming from fig = px.line(grouped_metrics, x="Date Snapshot", y="value", color='variable'). However, 'variable' is a valid column in my df. I have tried passing different columns to the color= argument but I get the same error. My three columns are 'variable', 'value', and 'Date Snapshot'. Apologies for the block of code:
Data for ref
import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import yfinance as yf
import pandas_market_calendars as mcal
from plotly.offline import init_notebook_mode, plot
init_notebook_mode(connected=True)
def create_market_cal(start, end):
    nyse = mcal.get_calendar('NYSE')
    schedule = nyse.schedule(stocks_start, stocks_end)
    market_cal = mcal.date_range(schedule, frequency='1D')
    market_cal = market_cal.tz_localize(None)
    market_cal = [i.replace(hour=0) for i in market_cal]
    return market_cal

def get_data(stocks, start, end):
    def data(ticker):
        df = yf.download(ticker, start=start, end=(end + datetime.timedelta(days=1)))
        df['symbol'] = ticker
        df.index = pd.to_datetime(df.index)
        return df
    datas = map(data, stocks)
    return(pd.concat(datas, keys=stocks, names=['Ticker', 'Date'], sort=True))

def get_benchmark(benchmark, start, end):
    benchmark = get_data(benchmark, start, end)
    benchmark = benchmark.drop(['symbol'], axis=1)
    benchmark.reset_index(inplace=True)
    return benchmark

portfolio_df = pd.read_csv('C:\\tmp\\stock_transactions.csv')
portfolio_df['Open date'] = pd.to_datetime(portfolio_df['Open date'])
symbols = portfolio_df.Symbol.unique()
stocks_start = datetime.datetime(2018, 3, 1)
stocks_end = datetime.datetime(2021, 3, 10)
daily_adj_close = get_data(symbols, stocks_start, stocks_end)
daily_adj_close = daily_adj_close[['Close']].reset_index()
daily_benchmark = get_benchmark(['SPY'], stocks_start, stocks_end)
daily_benchmark = daily_benchmark[['Date', 'Close']]
market_cal = create_market_cal(stocks_start, stocks_end)

def position_adjust(daily_positions, sale):
    stocks_with_sales = pd.DataFrame()
    buys_before_start = daily_positions[daily_positions['Type'] == 'Buy'].sort_values(by='Open date')
    for position in buys_before_start[buys_before_start['Symbol'] == sale[1]['Symbol']].iterrows():
        if position[1]['Qty'] <= sale[1]['Qty']:
            sale[1]['Qty'] -= position[1]['Qty']
            position[1]['Qty'] = 0
        else:
            position[1]['Qty'] -= sale[1]['Qty']
            sale[1]['Qty'] -= sale[1]['Qty']
        stocks_with_sales = stocks_with_sales.append(position[1])
    return stocks_with_sales

def portfolio_start_balance(portfolio, start_date):
    positions_before_start = portfolio[portfolio['Open date'] <= start_date]
    future_sales = portfolio[(portfolio['Open date'] >= start_date) & (portfolio['Type'] == 'Sell')]
    sales = positions_before_start[positions_before_start['Type'] == 'Sell'].groupby(['Symbol'])['Qty'].sum()
    sales = sales.reset_index()
    positions_no_change = positions_before_start[~positions_before_start['Symbol'].isin(sales['Symbol'].unique())]
    adj_positions_df = pd.DataFrame()
    for sale in sales.iterrows():
        adj_positions = position_adjust(positions_before_start, sale)
        adj_positions_df = adj_positions_df.append(adj_positions)
    adj_positions_df = adj_positions_df.append(positions_no_change)
    adj_positions_df = adj_positions_df.append(future_sales)
    adj_positions_df = adj_positions_df[adj_positions_df['Qty'] > 0]
    return adj_positions_df

active_portfolio = portfolio_start_balance(portfolio_df, stocks_start)

def fifo(daily_positions, sales, date):
    sales = sales[sales['Open date'] == date]
    daily_positions = daily_positions[daily_positions['Open date'] <= date]
    positions_no_change = daily_positions[~daily_positions['Symbol'].isin(sales['Symbol'].unique())]
    adj_positions = pd.DataFrame()
    for sale in sales.iterrows():
        adj_positions = adj_positions.append(position_adjust(daily_positions, sale))
    adj_positions = adj_positions.append(positions_no_change)
    adj_positions = adj_positions[adj_positions['Qty'] > 0]
    return adj_positions

def time_fill(portfolio, market_cal):
    sales = portfolio[portfolio['Type'] == 'Sell'].groupby(['Symbol', 'Open date'])['Qty'].sum()
    sales = sales.reset_index()
    per_day_balance = []
    for date in market_cal:
        if (sales['Open date'] == date).any():
            portfolio = fifo(portfolio, sales, date)
        daily_positions = portfolio[portfolio['Open date'] <= date]
        daily_positions = daily_positions[daily_positions['Type'] == 'Buy']
        daily_positions['Date Snapshot'] = date
        per_day_balance.append(daily_positions)
    return per_day_balance

positions_per_day = time_fill(active_portfolio, market_cal)

def modified_cost_per_share(portfolio, adj_close, start_date):
    df = pd.merge(portfolio, adj_close, left_on=['Date Snapshot', 'Symbol'],
                  right_on=['Date', 'Ticker'], how='left')
    df.rename(columns={'Close': 'Symbol Adj Close'}, inplace=True)
    df['Adj cost daily'] = df['Symbol Adj Close'] * df['Qty']
    df = df.drop(['Ticker', 'Date'], axis=1)
    return df

def benchmark_portfolio_calcs(portfolio, benchmark):
    portfolio = pd.merge(portfolio, benchmark, left_on=['Date Snapshot'],
                         right_on=['Date'], how='left')
    portfolio = portfolio.drop(['Date'], axis=1)
    portfolio.rename(columns={'Close': 'Benchmark Close'}, inplace=True)
    benchmark_max = benchmark[benchmark['Date'] == benchmark['Date'].max()]
    portfolio['Benchmark End Date Close'] = portfolio.apply(lambda x: benchmark_max['Close'], axis=1)
    benchmark_min = benchmark[benchmark['Date'] == benchmark['Date'].min()]
    portfolio['Benchmark Start Date Close'] = portfolio.apply(lambda x: benchmark_min['Close'], axis=1)
    return portfolio

def portfolio_end_of_year_stats(portfolio, adj_close_end):
    adj_close_end = adj_close_end[adj_close_end['Date'] == adj_close_end['Date'].max()]
    portfolio_end_data = pd.merge(portfolio, adj_close_end, left_on='Symbol',
                                  right_on='Ticker')
    portfolio_end_data.rename(columns={'Close': 'Ticker End Date Close'}, inplace=True)
    portfolio_end_data = portfolio_end_data.drop(['Ticker', 'Date'], axis=1)
    return portfolio_end_data

def portfolio_start_of_year_stats(portfolio, adj_close_start):
    adj_close_start = adj_close_start[adj_close_start['Date'] == adj_close_start['Date'].min()]
    portfolio_start = pd.merge(portfolio, adj_close_start[['Ticker', 'Close', 'Date']],
                               left_on='Symbol', right_on='Ticker')
    portfolio_start.rename(columns={'Close': 'Ticker Start Date Close'}, inplace=True)
    portfolio_start['Adj cost per share'] = np.where(portfolio_start['Open date'] <= portfolio_start['Date'],
                                                     portfolio_start['Ticker Start Date Close'],
                                                     portfolio_start['Adj cost per share'])
    portfolio_start['Adj cost'] = portfolio_start['Adj cost per share'] * portfolio_start['Qty']
    portfolio_start = portfolio_start.drop(['Ticker', 'Date'], axis=1)
    portfolio_start['Equiv Benchmark Shares'] = portfolio_start['Adj cost'] / portfolio_start['Benchmark Start Date Close']
    portfolio_start['Benchmark Start Date Cost'] = portfolio_start['Equiv Benchmark Shares'] * portfolio_start['Benchmark Start Date Close']
    return portfolio_start

def calc_returns(portfolio):
    portfolio['Benchmark Return'] = portfolio['Benchmark Close'] / portfolio['Benchmark Start Date Close'] - 1
    portfolio['Ticker Return'] = portfolio['Symbol Adj Close'] / portfolio['Adj cost per share'] - 1
    portfolio['Ticker Share Value'] = portfolio['Qty'] * portfolio['Symbol Adj Close']
    portfolio['Benchmark Share Value'] = portfolio['Equiv Benchmark Shares'] * portfolio['Benchmark Close']
    portfolio['Abs Value Compare'] = portfolio['Ticker Share Value'] - portfolio['Benchmark Start Date Cost']
    portfolio['Abs Value Return'] = portfolio['Abs Value Compare'] / portfolio['Benchmark Start Date Cost']
    portfolio['Stock Gain / (Loss)'] = portfolio['Ticker Share Value'] - portfolio['Adj cost']
    portfolio['Benchmark Gain / (Loss)'] = portfolio['Benchmark Share Value'] - portfolio['Adj cost']
    portfolio['Abs. Return Compare'] = portfolio['Ticker Return'] - portfolio['Benchmark Return']
    return portfolio

def per_day_portfolio_calcs(per_day_holdings, daily_benchmark, daily_adj_close, stocks_start):
    df = pd.concat(per_day_holdings, sort=True)
    mcps = modified_cost_per_share(df, daily_adj_close, stocks_start)
    bpc = benchmark_portfolio_calcs(mcps, daily_benchmark)
    pes = portfolio_end_of_year_stats(bpc, daily_adj_close)
    pss = portfolio_start_of_year_stats(pes, daily_adj_close)
    returns = calc_returns(pss)
    return returns

combined_df = per_day_portfolio_calcs(positions_per_day, daily_benchmark, daily_adj_close, stocks_start)

def line(df, val_1, val_2):
    grouped_metrics = combined_df.groupby(['Date Snapshot'])[[val_1, val_2]].sum().reset_index()
    grouped_metrics = pd.melt(grouped_metrics, id_vars=['Date Snapshot'],
                              value_vars=[val_1, val_2])
    fig = px.line(grouped_metrics, x="Date Snapshot", y="value",
                  color='variable')
    plot(fig)

line(combined_df, 'Stock Gain / (Loss)', 'Benchmark Gain / (Loss)')

def line_facets(df, val_1, val_2):
    grouped_metrics = combined_df.groupby(['Symbol', 'Date Snapshot'])[[val_1, val_2]].sum().reset_index()
    grouped_metrics = pd.melt(grouped_metrics, id_vars=['Symbol', 'Date Snapshot'],
                              value_vars=[val_1, val_2])
    fig = px.line(grouped_metrics, x="Date Snapshot", y="value",
                  color='variable', facet_col="Symbol", facet_col_wrap=5)
    plot(fig)

line_facets(combined_df, 'Ticker Return', 'Benchmark Return')
The above throws the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-4-337cc930bd36> in <module>
183
184
--> 185 line(combined_df, 'Stock Gain / (Loss)', 'Benchmark Gain / (Loss)')
186
187
<ipython-input-4-337cc930bd36> in line(df, val_1, val_2)
179 value_vars=[val_1, val_2])
180 fig = px.line(grouped_metrics, x="Date Snapshot", y="value",
--> 181 color='variable')
182 plot(fig)
183
~\anaconda3\lib\site-packages\plotly\express\_chart_types.py in line(data_frame, x, y, line_group, color, line_dash, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, line_dash_sequence, line_dash_map, log_x, log_y, range_x, range_y, line_shape, render_mode, title, template, width, height)
250 a polyline mark in 2D space.
251 """
--> 252 return make_figure(args=locals(), constructor=go.Scatter)
253
254
~\anaconda3\lib\site-packages\plotly\express\_core.py in make_figure(args, constructor, trace_patch, layout_patch)
1887 prefix = get_label(args, args["facet_row"]) + "="
1888 row_labels = [prefix + str(s) for s in sorted_group_values[m.grouper]]
-> 1889 for val in sorted_group_values[m.grouper]:
1890 if val not in m.val_map:
1891 m.val_map[val] = m.sequence[len(m.val_map) % len(m.sequence)]
KeyError: 'variable'
In case someone comes across this issue: I had the same situation, and in my case the error message was misleading. The root cause was that the dataframe passed to px.line() was empty (no rows).
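A minimal sketch of that failure mode and a guard against it (the frame below is fabricated for illustration; only the column names match the script above):

import pandas as pd
import plotly.express as px
from plotly.offline import plot

grouped_metrics = pd.DataFrame(columns=["Date Snapshot", "variable", "value"])

# Per the answer above, a frame with zero rows can surface as KeyError: 'variable'
# even though the column exists, so check before plotting.
if grouped_metrics.empty:
    print("Nothing to plot - check the upstream merges/filters that built this frame.")
else:
    fig = px.line(grouped_metrics, x="Date Snapshot", y="value", color="variable")
    plot(fig)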

Why is group by so slow on large data?

I built this code to group my data by total, month, week, day, hour, and 10 minutes. The problem is that when I run it on large data with more than 20 million lines, it takes too long and did not finish even after 20 hours. I tried splitting the file into smaller files, but it is still slow.
Can you please check my code? Is it possible to speed up the process?
def my_agg1(x):
    names = {
        'Total_in_averge_amount': x['value'].mean(),
        'Total_in_max_amount': x['value'].max(),
        'Total_in_min_amount': x['value'].min(),
        'Total_in_totalamount': x['value'].sum(),
        'Total_in_standard_deviation': x['value'].std(),
        'Total_in_degree': x['inputs'].sum(),
        'duration': x['date'].max() - x['date'].min()}
    return pd.Series(names, index=['Total_in_totalamount', 'Total_in_degree', 'Total_in_averge_amount', 'Total_in_max_amount', 'Total_in_min_amount', 'Total_in_standard_deviation', 'duration'])
# Read CSV File
df = pd.read_csv('inputs_header_2.csv', encoding="ISO-8859-1")
# Total
groupTotal = df.groupby(['user']).apply(my_agg1)
c = abs(groupTotal['duration'].astype('timedelta64[D]'))
groupTotal['Total_in_standard_deviation'].fillna('0', inplace=True)
groupTotal['DurationInMin'] = (c * 24) * 60
TotalinDegree = groupTotal['Total_in_degree']
InTotalAmount = groupTotal['Total_in_totalamount']
InTransactionRate = (TotalinDegree / c)
groupTotal['Total_in_transaction_rate'] = round(InTransactionRate, 2)
groupTotal['Total_average_in_speed'] = InTotalAmount / c
Acceleration = (InTotalAmount / c ** 2)
groupTotal['Total_in-acceleration'] = Acceleration
groupTotal.replace(np.inf, 0, inplace=True)
groupTotal = groupTotal.drop(groupTotal.columns[[6]], axis=1)
#groupTotal.to_csv(r'fr1.csv', sep='\t', float_format='%.8f')
print(groupTotal)
groupTotal.to_csv(r'total.csv', sep='\t', float_format='%.8f')
# print(groupTotal)
#################################################################################
# Monthly
monthly_group_result = (df.set_index('date')
.groupby('user')
.resample('M', convention='start')
.agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
'inputs': ['sum']}))
monthly_group_result.columns = monthly_group_result.columns.map('_'.join)
d_monthly = {'value_max': 'Monthly_in_max_amount',
'value_min': 'Monthly_in_min_amount',
'value_sum': 'Monthly_in_totalamount',
'inputs_sum': 'Monthly_in_degree',
'value_std': 'Monthly_in_standard_deviation',
'value_mean': 'Monthly_in_averge_amount'}
monthly_group = monthly_group_result.rename(columns=d_monthly).reset_index()
MonthlygroupUser = monthly_group.loc[monthly_group.groupby('user')['Monthly_in_degree'].idxmax()]
MonthlygroupUser['Monthly_in_standard_deviation'].fillna('0', inplace=True)
MonthlyinDegree = MonthlygroupUser['Monthly_in_degree']
MonthlyInTotalAmount = MonthlygroupUser['Monthly_in_totalamount']
Monthly_date = MonthlygroupUser['date'].dt.daysinmonth
MonthlyInTransactionRate = (MonthlyinDegree / Monthly_date)
MonthlygroupUser['Monthly_in_transaction_rate'] = round(MonthlyInTransactionRate, 3)
MonthlygroupUser['Monthly_average_in_speed'] = MonthlyInTotalAmount / Monthly_date
MonthlyAcceleration = (MonthlyInTotalAmount / Monthly_date ** 2)
MonthlygroupUser['Monthly_in-acceleration'] = MonthlyAcceleration
MonthlygroupUser = MonthlygroupUser.drop(MonthlygroupUser.columns[[1, 5]], axis=1)
# Weekly
Weekly_group_result = (df.set_index('date')
.groupby('user')
.resample('7D', convention='start')
.agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
'inputs': ['sum']}))
Weekly_group_result.columns = Weekly_group_result.columns.map('_'.join)
d_weekly = {'value_max': 'Weekly_in_max_amount',
'value_min': 'Weekly_in_min_amount',
'value_sum': 'Weekly_in_totalamount',
'inputs_sum': 'Weekly_in_degree',
'value_std': 'Weekly_in_standard_deviation',
'value_mean': 'Weekly_in_averge_amount'}
Weeklygroup = Weekly_group_result.rename(columns=d_weekly).reset_index()
WeeklygroupUser = Weeklygroup.loc[Weeklygroup.groupby('user')['Weekly_in_degree'].idxmax()]
WeeklygroupUser['Weekly_in_standard_deviation'].fillna('0', inplace=True)
WeeklyinDegree = WeeklygroupUser['Weekly_in_degree']
WeeklyInTotalAmount = WeeklygroupUser['Weekly_in_totalamount']
WeeklyInTransactionRate = (WeeklyinDegree / 7)
WeeklygroupUser['Weekly_in_transaction_rate'] = round(WeeklyInTransactionRate, 3)
WeeklygroupUser['Weekly_average_in_speed'] = WeeklyInTotalAmount / 7
WeeklyAcceleration = (WeeklyInTotalAmount / 7 ** 2)
WeeklygroupUser['Weekly_in-acceleration'] = WeeklyAcceleration
WeeklygroupUser = WeeklygroupUser.drop(WeeklygroupUser.columns[[1, 4]], axis=1)
# daily
Daily_group_result = (df.set_index('date')
.groupby('user')
.resample('D', convention='start')
.agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
'inputs': ['sum']}))
Daily_group_result.columns = Daily_group_result.columns.map('_'.join)
d_daily = {'value_max': 'Daily_in_max_amount',
'value_min': 'Daily_in_min_amount',
'value_sum': 'Daily_in_totalamount',
'inputs_sum': 'Daily_in_degree',
'value_std': 'Daily_in_standard_deviation',
'value_mean': 'Daily_in_averge_amount'}
Dailygroup = Daily_group_result.rename(columns=d_daily).reset_index()
DailygroupUser = Dailygroup.loc[Dailygroup.groupby('user')['Daily_in_degree'].idxmax()]
DailygroupUser['Daily_in_standard_deviation'].fillna('0', inplace=True)
DailyinDegree = DailygroupUser['Daily_in_degree']
DailyInTotalAmount = DailygroupUser['Daily_in_totalamount']
DailyInTransactionRate = (DailyinDegree / 1)
DailygroupUser['Daily_in_transaction_rate'] = round(DailyInTransactionRate, 3)
DailygroupUser['Daily_average_in_speed'] = DailyInTotalAmount / 1
DailyAcceleration = (DailyInTotalAmount / 1 ** 2)
DailygroupUser['Daily_in-acceleration'] = DailyAcceleration
DailygroupUser = DailygroupUser.drop(DailygroupUser.columns[[1, 4]], axis=1)
# Hourly
# group all users by hour
Hourly_group_result = (df.set_index('date')
.groupby('user')
.resample('H', convention='start')
.agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
'inputs': ['sum']}))
Hourly_group_result.columns = Hourly_group_result.columns.map('_'.join)
d_hourly = {'value_max': 'Hourly_in_max_amount',
'value_min': 'Hourly_in_min_amount',
'value_sum': 'Hourly_in_totalamount',
'inputs_sum': 'Hourly_in_degree',
'value_std': 'Hourly_in_standard_deviation',
'value_mean': 'Hourly_in_averge_amount'}
Hourlygroup = Hourly_group_result.rename(columns=d_hourly).reset_index()
HourlygroupUser = Hourlygroup.loc[Hourlygroup.groupby('user')['Hourly_in_degree'].idxmax()]
HourlygroupUser['Hourly_in_standard_deviation'].fillna('0', inplace=True)
HourlyinDegree = HourlygroupUser['Hourly_in_degree']
HourlyInTotalAmount = HourlygroupUser['Hourly_in_totalamount']
HourlyInTransactionRate = (HourlyinDegree / (1 / 24))
HourlygroupUser['Hourly_in_transaction_rate'] = round(HourlyInTransactionRate, 3)
HourlygroupUser['Hourly_average_in_speed'] = HourlyInTotalAmount / 1
HourlyAcceleration = (HourlyInTotalAmount / (1 / 24) ** 2)
HourlygroupUser['Hourly_in-acceleration'] = HourlyAcceleration
HourlygroupUser = HourlygroupUser.drop(HourlygroupUser.columns[[1, 4]], axis=1)
# group all users by 10Min
TenMin_group_result = (df.set_index('date')
.groupby('user')
.resample('10Min', convention='start')
.agg({'value': ['mean', 'max', 'min', 'sum', 'std'],
'inputs': ['sum']}))
TenMin_group_result.columns = TenMin_group_result.columns.map('_'.join)
d_10Min = {'value_max': 'TenMin_in_max_amount',
'value_min': 'TenMin_in_min_amount',
'value_sum': 'TenMin_in_totalamount',
'inputs_sum': 'TenMin_in_degree',
'value_std': 'TenMin_in_standard_deviation',
'value_mean': 'TenMin_in_averge_amount'}
TenMingroup = TenMin_group_result.rename(columns=d_10Min).reset_index()
TenMingroupUser = TenMingroup.loc[TenMingroup.groupby('user')['TenMin_in_degree'].idxmax()]
TenMingroupUser['TenMin_in_standard_deviation'].fillna('0', inplace=True)
TenMininDegree = TenMingroupUser['TenMin_in_degree']
TenMinInTotalAmount = TenMingroupUser['TenMin_in_totalamount']
TenMinInTransactionRate = (TenMininDegree / (1 / 24 / 6))
TenMingroupUser['TenMin_in_transaction_rate'] = round(TenMinInTransactionRate, 3)
TenMingroupUser['TenMin_average_in_speed'] = TenMinInTotalAmount / (1 / 24 / 6)
TenMinAcceleration = ((TenMinInTotalAmount / (1 / 24 / 6) ** 2)) - (1 / 24 / 6)
TenMingroupUser['TenMin_in-acceleration'] = TenMinAcceleration
TenMingroupUser = TenMingroupUser.drop(TenMingroupUser.columns[[1, 4]], axis=1)
#TenMingroupUser.to_csv(r'fr1.csv', sep='\t', float_format='%.8f')
FinalTotal = pd.merge(groupTotal, MonthlygroupUser, on='user').merge(WeeklygroupUser, on='user').merge(DailygroupUser, on='user').merge(HourlygroupUser, on='user').merge(TenMingroupUser, on='user')
FinalTotal.to_csv(r'Final_Inputs_2.csv', sep='\t', float_format='%.8f', index=False)
Thank you for any help
Regards,
Khaled
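No answer is recorded here, but one direction worth sketching (a suggestion added here, not from the original thread): groupby(...).apply(my_agg1) calls a Python function once per user, which is usually far slower than computing the same statistics with a single vectorised agg. A rough equivalent of the "Total" block, assuming the date column parses as datetime:

import pandas as pd

df = pd.read_csv('inputs_header_2.csv', encoding="ISO-8859-1", parse_dates=['date'])

# Named aggregation runs in optimised pandas code instead of per-group Python.
group_total = df.groupby('user').agg(
    Total_in_totalamount=('value', 'sum'),
    Total_in_degree=('inputs', 'sum'),
    Total_in_averge_amount=('value', 'mean'),
    Total_in_max_amount=('value', 'max'),
    Total_in_min_amount=('value', 'min'),
    Total_in_standard_deviation=('value', 'std'),
    date_min=('date', 'min'),
    date_max=('date', 'max'),
)
group_total['duration'] = group_total['date_max'] - group_total['date_min']
group_total = group_total.drop(columns=['date_min', 'date_max'])

The resample-based blocks further down already use agg, so the apply in the "Total" step is the most likely bottleneck.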

'numpy.ndarray' object is not callable in the IPython console

I am trying to calculate cpmx, hmx, smpx, tmpx and smvx by doing simple interpolation after loading the data from Excel into a pandas DataFrame.
When I call the function with cpmx = absmatdata(1, 0, 0, 0, 44.011, 100) I see:
'numpy.ndarray' object is not callable
Any idea how to go about this?
Here is my code:
import numpy as np
import pandas as pd

def absmatdata(a, b, c, d, material, tmp_ref):
    material_map = {2.016: 'H2', 28.016: 'N2', 32.000: 'O2', 32.065: 'S',
                    18.016: 'H2O', 64.065: 'SO2', 12.001: 'C Graphite',
                    28.011: 'CO', 44.011: 'CO2', 16.043: 'CH4', 30.070: 'C2H6',
                    44.097: 'C3H8', 58.124: 'C4H10'}
    if material in material_map:
        df = pd.read_excel('F:\MAschinenbau\Bachelorarbeit\ABSMAT.xlsx', sheet_name=material_map[material])
    else:
        print('No data for this material available')
        df = [list(np.arange(0,1100,100)), list(np.arange(0,11,1)), list(np.arange(0,11,1)), list(np.arange(0,11,1)), list(np.arange(0,11,1))]
    tmp = df.values[:,0]
    cpm = df.values[:,1]
    hm = df.values[:,2]
    smp = df.values[:,3]
    smv = df.values[:,4]
    tn = np.size(df)
    tmp0 = tmp_ref
    tmpx = a
    cpmx = 0
    hmx = b
    smpx = c
    smvx = d
    if a==0 and b==0 and c==0 and d==0:
        print('All values are zero')
    elif a!=0 and b==0 and c==0 and d==0:
        print('T interpolation')
        for i in range(0,tn-1):
            if tmpx > tmp(i) and tmpx <= tmp(i+1):
                int_fak = (tmpx-tmp(i))/(tmp(i+1)-tmp(i))
                cpmx = cpm(i) + int_fak*(cpm(i+1)-cpm(i))
                hmx = hm(i) + int_fak*(hm(i+1)-hm(i))
                smpx = smp(i) + int_fak*(smp(i+1)-smp(i))
                smvx = smv(i) + int_fak*(smv(i+1)-smv(i))
    return tmpx, cpmx, hmx, smpx, smvx
You set df to a DataFrame.
You set tmp = df.values[:,0].
Now tmp is a numpy.ndarray.
You have to get its items with [], not with ().
Your loop part:
if tmpx > tmp(i) and tmpx <= tmp(i+1):
    int_fak = (tmpx-tmp(i))/(tmp(i+1)-tmp(i))
    cpmx = cpm(i) + int_fak*(cpm(i+1)-cpm(i))
    hmx = hm(i) + int_fak*(hm(i+1)-hm(i))
    smpx = smp(i) + int_fak*(smp(i+1)-smp(i))
    smvx = smv(i) + int_fak*(smv(i+1)-smv(i))
should change to:
if tmpx > tmp[i] and tmpx <= tmp[i+1]:
    int_fak = (tmpx-tmp[i])/(tmp[i+1]-tmp[i])
    cpmx = cpm[i] + int_fak*(cpm[i+1]-cpm[i])
    hmx = hm[i] + int_fak*(hm[i+1]-hm[i])
    smpx = smp[i] + int_fak*(smp[i+1]-smp[i])
    smvx = smv[i] + int_fak*(smv[i+1]-smv[i])
Also you need to change your tn to
tn = np.size(df.values[:,0])
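As an aside (an addition here, not part of the fix above): once the indexing is corrected, the manual loop can usually be replaced by np.interp, which performs the same piecewise-linear interpolation in one call:

import numpy as np

# tmp, cpm, hm, smp, smv are the table columns read from the sheet, tmpx is the query temperature.
# np.interp assumes tmp is sorted in increasing order, as temperature tables usually are.
cpmx = np.interp(tmpx, tmp, cpm)
hmx = np.interp(tmpx, tmp, hm)
smpx = np.interp(tmpx, tmp, smp)
smvx = np.interp(tmpx, tmp, smv)

Note that np.interp clamps to the end values outside the table range, while the original loop leaves the defaults untouched in that case.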

Saving a dataframe to CSV in Python 2

I am attempting to save a dataframe to CSV. When I print the dataframe it produces the output I'm looking for, but when I save the dataframe to CSV I only get the last line of the dataframe in the CSV file. Here is what I have attempted so far:
Index_tickers = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\Index_tickers.csv')
Ticker = Index_tickers.ticker
for ticker in Index_tickers.ticker:
    index_data = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\' + ticker + '_1.csv')
    mkt_data = index_data[['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']]
    numRow = 2
    while numRow < endRow:
        dOpen0 = mkt_data.ix[numRow, 'Open']
        dHigh0 = mkt_data.ix[numRow, 'High']
        dLow0 = mkt_data.ix[numRow, 'Low']
        dClose0 = mkt_data.ix[numRow, 'Close']
        dDate0 = mkt_data.ix[numRow, 'Date']
        dTime0 = mkt_data.ix[numRow, 'Time']
        dTicker0 = index_data.ix[numRow, 'Ticker']
        dHaClose0 = (dOpen0 + dHigh0 + dLow0 + dClose0) / 4
        dClose1 = mkt_data.ix[numRow - 2, 'Close']
        dOpen1 = mkt_data.ix[numRow - 2, 'Open']
        dHaOpen0 = (dClose1 + dOpen1) / 2
        dHaHigh0 = max(dHigh0, dHaOpen0, dHaClose0)
        dHaLow0 = min(dLow0, dHaOpen0, dHaClose0)
        dHaGreen0 = dHaClose0 > dHaOpen0
        dHaRed0 = dHaClose0 < dHaOpen0
        dNumRow = numRow
        numRow = numRow + 1
        df = pd.DataFrame({'numRow': pd.Series(dNumRow), 'Time': pd.Series(dTime0), 'Date': pd.Series(dDate0), 'Ticker': pd.Series(dTicker0), 'Open0': pd.Series(dOpen0), 'High0': pd.Series(dHigh0), 'Low0': pd.Series(dLow0), 'Close0': pd.Series(dClose0)})
        # print df
        df.to_csv('C:\Users\\ME\\Dropbox\\MktData\HaDetail.csv')
Any help hugely appreciated. I'm new to Python and learning on the job.
You are overwriting your CSV on each iteration because the default mode is 'w', which overwrites the file if it exists. Additionally, you are writing out your header every time, and you only need to do this on the first iteration, so I would do the following:
Index_tickers = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\Index_tickers.csv')
Ticker = Index_tickers.ticker
writeHeader = True
for ticker in Index_tickers.ticker:
    index_data = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\' + ticker + '_1.csv')
    mkt_data = index_data[['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']]
    numRow = 2
    while numRow < endRow:
        dOpen0 = mkt_data.ix[numRow, 'Open']
        dHigh0 = mkt_data.ix[numRow, 'High']
        dLow0 = mkt_data.ix[numRow, 'Low']
        dClose0 = mkt_data.ix[numRow, 'Close']
        dDate0 = mkt_data.ix[numRow, 'Date']
        dTime0 = mkt_data.ix[numRow, 'Time']
        dTicker0 = index_data.ix[numRow, 'Ticker']
        dHaClose0 = (dOpen0 + dHigh0 + dLow0 + dClose0) / 4
        dClose1 = mkt_data.ix[numRow - 2, 'Close']
        dOpen1 = mkt_data.ix[numRow - 2, 'Open']
        dHaOpen0 = (dClose1 + dOpen1) / 2
        dHaHigh0 = max(dHigh0, dHaOpen0, dHaClose0)
        dHaLow0 = min(dLow0, dHaOpen0, dHaClose0)
        dHaGreen0 = dHaClose0 > dHaOpen0
        dHaRed0 = dHaClose0 < dHaOpen0
        dNumRow = numRow
        numRow = numRow + 1
        df = pd.DataFrame({'numRow': pd.Series(dNumRow), 'Time': pd.Series(dTime0), 'Date': pd.Series(dDate0), 'Ticker': pd.Series(dTicker0), 'Open0': pd.Series(dOpen0), 'High0': pd.Series(dHigh0), 'Low0': pd.Series(dLow0), 'Close0': pd.Series(dClose0)})
        # print df
        if writeHeader:
            df.to_csv('C:\Users\\ME\\Dropbox\\MktData\HaDetail.csv')
            writeHeader = False
        else:
            df.to_csv('C:\Users\\ME\\Dropbox\\MktData\HaDetail.csv', header=False, mode='a')
So we only write the header on the first iteration, and for each subsequent iteration we change the mode to 'a' so it appends to the file; see the docs.
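As an alternative worth mentioning (an addition here, not part of the original answer): rather than opening the CSV on every loop iteration, you can collect the rows in a list and write one DataFrame at the end, which is simpler and avoids repeated file opens:

rows = []
for ticker in Index_tickers.ticker:
    index_data = pd.read_csv('C:\\Users\\ME\\Dropbox\\MktData\\Index_list\\' + ticker + '_1.csv')
    mkt_data = index_data[['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume']]
    numRow = 2
    while numRow < endRow:
        # ... compute dOpen0, dHigh0, dHaClose0, etc. exactly as above ...
        rows.append({'numRow': numRow, 'Time': dTime0, 'Date': dDate0, 'Ticker': dTicker0,
                     'Open0': dOpen0, 'High0': dHigh0, 'Low0': dLow0, 'Close0': dClose0})
        numRow = numRow + 1

pd.DataFrame(rows).to_csv('C:\\Users\\ME\\Dropbox\\MktData\\HaDetail.csv', index=False)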
