How to update linear regression line with live data - python

I'm retrieving live data to use for further processing in a dataframe.
The first part (the get_binance_bars function) gets the historical data, to which a linear regression line is fitted.
Now I would like the linear regression line to be updated whenever the websocket receives new data. The changing live price is in df['live_price'].
How would you do this?
import websocket, json
import requests
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, date
from sklearn.linear_model import LinearRegression

symbol = "ETHUSDT"
tf = "1m"

now = datetime.now()
today = date.today()
d = int(today.strftime("%d"))
m = int(today.strftime("%m"))
y = int(today.strftime("%Y"))
hr = int(now.strftime("%H"))
mn = int(now.strftime("%M"))

def get_binance_bars(ticker, interval, startTime, endTime):
    url = "https://api.binance.com/api/v3/klines"
    startTime = str(int(startTime.timestamp() * 1000))
    endTime = str(int(endTime.timestamp() * 1000))
    limit = '1000'
    req_params = {"symbol": ticker, 'interval': interval, 'startTime': startTime, 'endTime': endTime, 'limit': limit}
    df = pd.DataFrame(json.loads(requests.get(url, params=req_params).text))
    if len(df.index) == 0:
        return None
    df = df.iloc[:, 0:4]
    df.columns = ['time', 'high', 'low', 'close']
    df.close = df.close.astype("float")
    df.low = df.low.astype("float")
    df.high = df.high.astype("float")
    global Y_pred
    X = df.time.iloc[-20:].values.reshape(-1, 1)
    Y = df.close.iloc[-20:].values.reshape(-1, 1)
    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    Y_pred = linear_regressor.predict(X)
    df['Y_pred'] = np.nan
    df.iloc[-20:, df.columns.get_loc('Y_pred')] = Y_pred
    df.time = [dt.datetime.fromtimestamp(x / 1000.0) for x in df.time]
    df.drop(df.tail(1).index, inplace=True)  # cut last row to prevent a double bar with live data
    return df
SOCKET = "wss://stream.binance.com:9443/ws/"+symbol.lower()+"#kline_"+tf
df = get_binance_bars(symbol, tf, dt.datetime(y, m, d, hr - hr, mn), dt.datetime(y, m, d, hr, mn))  # define how many bars, hr-1 = 60 bars

def on_open(ws):
    print('opened connection')

def on_close(ws):
    print('closed connection')

def on_message(ws, message):
    global df, time_plot, close, low, high
    json_message = json.loads(message)
    high = float(json_message['k']['h'])
    low = float(json_message['k']['l'])
    close = float(json_message['k']['c'])
    time_plot = dt.datetime.fromtimestamp(json_message['k']['t'] / 1000).strftime('%H:%M')
    df['live_price'] = close
    df.iloc[-20:, df.columns.get_loc('Y_pred')] = Y_pred  # <--- DOESN'T WORK
    print(df)

ws = websocket.WebSocketApp(SOCKET, on_open=on_open, on_close=on_close, on_message=on_message)
ws.run_forever()

I would prefer to use Plotly for this sort of work. Have a look at Dash with the dcc.Interval component for updating graphs and plots; it will be more useful in the longer run, especially for building dashboards.
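For illustration, here is a minimal sketch of that approach (my assumptions, not tested against your setup: dash >= 2 is installed, df is the websocket-fed DataFrame from your question, the websocket client runs in a background thread so run_forever() doesn't block Dash, and the regression is simply refitted on the latest 20 bars every tick):

    import numpy as np
    import plotly.graph_objects as go
    from dash import Dash, dcc, html, Input, Output
    from sklearn.linear_model import LinearRegression

    app = Dash(__name__)
    app.layout = html.Div([
        dcc.Graph(id='live-chart'),
        dcc.Interval(id='tick', interval=1000, n_intervals=0),  # fire once per second
    ])

    @app.callback(Output('live-chart', 'figure'), Input('tick', 'n_intervals'))
    def refresh(_):
        # refit the regression over the latest 20 bars on every tick;
        # df is the DataFrame your on_message callback keeps up to date
        tail = df.tail(20)
        X = np.arange(len(tail)).reshape(-1, 1)
        Y = tail['close'].values.reshape(-1, 1)
        y_pred = LinearRegression().fit(X, Y).predict(X)
        fig = go.Figure()
        fig.add_scatter(x=df['time'], y=df['close'], mode='lines', name='close')
        fig.add_scatter(x=tail['time'], y=y_pred.ravel(), mode='lines', name='regression')
        return fig

    if __name__ == '__main__':
        app.run_server(debug=True)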


Python bot for trading

import yfinance as yf
import pandas as pd

dataF = yf.download("EURUSD=X", start="2022-12-22", end="2022-12-24", interval='60m')
print(dataF.iloc[:])

def signal_generator(df):
    open = df.Open.iloc[-1]
    close = df.Close.iloc[1]
    one_open = df.Open.iloc[-2]
    one_close = df.Close.iloc[-2]
    # Bearish Pattern
    if (open <= close and
            one_open > one_close):
        return 1
    # Bullish Pattern
    elif (open >= close and
            one_open < one_close):
        return 2
    # No clear pattern
    else:
        return 0

signal = []
signal.append(0)
for i in range(1, len(dataF)):
    df = dataF[i-1:i+1]
    signal.append(signal_generator(df))
#signal_generator(data)
dataF["signal"] = signal
print(dataF.signal.value_counts())
The first example seems to work, grabbing 2 candlesticks, but when I grab 4, as in the next code example, it shows me an error:
import yfinance as yf
import pandas as pd

dataF = yf.download("EURUSD=X", start="2022-12-22", end="2022-12-24", interval='60m')
print(dataF.iloc[:])

def signal_generator(df):
    open = df.Open.iloc[-1]
    close = df.Close.iloc[1]
    one_open = df.Open.iloc[-2]
    one_close = df.Close.iloc[-2]
    two_open = df.Close.iloc[-3]
    two_close = df.Close.iloc[-3]
    three_open = df.Close.iloc[-3]
    three_close = df.Close.iloc[-3]
    # Bearish Pattern
    if (open <= close and
            one_open > one_close and
            two_open < two_close and
            three_open >= three_close):
        return 1
    # Bullish Pattern
    elif (open >= close and
            one_open < one_close and
            two_open > two_close and
            three_open <= three_close):
        return 2
    # No clear pattern
    else:
        return 0

signal = []
signal.append(0)
for i in range(1, len(dataF)):
    df = dataF[i-1:i+1]
    signal.append(signal_generator(df))
#signal_generator(data)
dataF["signal"] = signal
print(dataF.signal.value_counts())
I believe the problem is in this line:
    df = dataF[i-1:i+1]
It slices only two rows out of dataF, while the four-candle version of signal_generator indexes back to iloc[-3] (and presumably should go back to iloc[-4]), so the window it receives is too short.
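A sketch of one possible fix (my assumption about the intent: compare the last four candles; note the original also reads Close where Open was probably meant, and uses iloc[-3] for both the two_ and three_ candles):

    def signal_generator(df):
        open = df.Open.iloc[-1]
        close = df.Close.iloc[-1]                         # was iloc[1] in the original
        one_open, one_close = df.Open.iloc[-2], df.Close.iloc[-2]
        two_open, two_close = df.Open.iloc[-3], df.Close.iloc[-3]
        three_open, three_close = df.Open.iloc[-4], df.Close.iloc[-4]
        # Bearish pattern
        if open <= close and one_open > one_close and two_open < two_close and three_open >= three_close:
            return 1
        # Bullish pattern
        elif open >= close and one_open < one_close and two_open > two_close and three_open <= three_close:
            return 2
        # No clear pattern
        return 0

    signal = [0, 0, 0]                                    # no full window for the first three bars
    for i in range(3, len(dataF)):
        signal.append(signal_generator(dataF[i-3:i+1]))   # four-row window
    dataF["signal"] = signal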
Try this and report back.
from utils import *
import time
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
from pandas_datareader import data as wb
import yfinance as yf  # needed for yf.download below

tickers = ['SBUX']
start = '2022-09-01'
end = '2022-12-13'
price_data = []
for ticker in tickers:
    data = yf.download(ticker, start, end)
    data = data.reset_index()
    prices = data.loc[:, ['Date', 'Adj Close']]
    price_data.append(prices.assign(ticker=ticker)[['ticker', 'Date', 'Adj Close']])
df = pd.concat(price_data)
df.dtypes
df.head()
df.shape
# Technical Indicators
data = df
num_training_days = int(data.shape[0]*.7)
print('Number of training days: {}. Number of test days: {}.'.format(num_training_days, data.shape[0]-num_training_days))
data['ma7'] = data['Adj Close'].rolling(window=7).mean()
data['ma21'] = data['Adj Close'].rolling(window=21).mean()
# Create exponential weighted moving average
data['26ema'] = data['Adj Close'].ewm(span=26).mean()
data['12ema'] = data['Adj Close'].ewm(span=12).mean()
data['MACD'] = (data['12ema']-data['26ema'])
# Create Bollinger Bands
data['20sd'] = data['Adj Close'].rolling(window=20).std()
data['upper_band'] = data['ma21'] + (data['20sd']*2)
data['lower_band'] = data['ma21'] - (data['20sd']*2)
# Create Exponential moving average
data['ema'] = data['Adj Close'].ewm(com=0.5).mean()
# Create Momentum
data['momentum'] = data['Adj Close']-1
dataset_TI_df = data
dataset = data
#def plot_technical_indicators(dataset, last_days):
last_days = 250
plt.figure(figsize=(16, 10), dpi=100)
shape_0 = dataset.shape[0]
xmacd_ = shape_0-last_days
dataset = dataset.iloc[-last_days:, :]
x_ = range(3, dataset.shape[0])
x_ =list(dataset.index)
# Plot first subplot
plt.subplot(2, 1, 1)
plt.plot(dataset['ma7'],label='MA 7', color='g',linestyle='--')
plt.plot(dataset['Adj Close'],label='Closing Price', color='b')
plt.plot(dataset['ma21'],label='MA 21', color='r',linestyle='--')
plt.plot(dataset['upper_band'],label='Upper Band', color='c')
plt.plot(dataset['lower_band'],label='Lower Band', color='c')
plt.fill_between(x_, dataset['lower_band'], dataset['upper_band'], alpha=0.35)
plt.title('Technical indicators for Starbucks - last {} days.'.format(last_days))
plt.legend()
# Plot second subplot
plt.subplot(2, 1, 2)
plt.title('MACD')
plt.plot(dataset['MACD'],label='MACD', linestyle='-.')
plt.hlines(15, xmacd_, shape_0, colors='g', linestyles='--')
plt.hlines(-15, xmacd_, shape_0, colors='g', linestyles='--')
# plt.plot(dataset['log_momentum'],label='Momentum', color='b',linestyle='-')
plt.legend()
plt.show()
# Trade Signals
signalBuy = []
signalSell = []
position = False
for i in range(len(data)):
    if data['ma7'][i] > data['ma21'][i]:
        if position == False:
            signalBuy.append(data['Adj Close'][i])
            signalSell.append(np.nan)
            position = True
        else:
            signalBuy.append(np.nan)
            signalSell.append(np.nan)
    elif data['ma7'][i] < data['ma21'][i]:
        if position == True:
            signalBuy.append(np.nan)
            signalSell.append(data['Adj Close'][i])
            position = False
        else:
            signalBuy.append(np.nan)
            signalSell.append(np.nan)
    else:
        signalBuy.append(np.nan)
        signalSell.append(np.nan)
data['Buy_Signal_price'] = signalBuy
data['Sell_Signal_price'] = signalSell
data
# Plotting Buy and Sell Points
fig, ax = plt.subplots(figsize=(14,8))
ax.plot(data['Adj Close'] , label = 'stock' ,linewidth=0.5, color='blue', alpha = 0.9)
ax.plot(data['ma7'], label = 'ma7', alpha = 0.85)
ax.plot(data['ma21'], label = 'ma21' , alpha = 0.85)
ax.scatter(data.index , data['Buy_Signal_price'] , label = 'Buy' , marker = '^', color = 'green',alpha =1 )
ax.scatter(data.index , data['Sell_Signal_price'] , label = 'Sell' , marker = 'v', color = 'red',alpha =1 )
ax.set_title(" Price History with buy and sell signals",fontsize=10, backgroundcolor='blue', color='white')
ax.set_xlabel(f'{start} - {end}', fontsize=18)
ax.set_ylabel('Close Price INR (₨)' , fontsize=18)
legend = ax.legend()
ax.grid()
plt.tight_layout()
plt.show()
moving_average_window = 30
data = df
# readjusting the data frame
data = data[["Adj Close"]]
# creating the moving average
data["ma20"] = data["Adj Close"].rolling(window=moving_average_window).mean()
# calculating daily returns
data["daily returns"] = np.log(data["Adj Close"] / data["Adj Close"].shift(1))
data["position"] = [0] * len(data)
data.reset_index(inplace=True)
data = data.drop(["index"], axis=1)
pos_exit = False
pos = "N"
std = round(data["daily returns"].std(), 4)
mean = round(data["daily returns"].mean(), 4)
print("Std on daily returns :", std)
print("Mean on daily returns :", mean, "\n")
print(data.head(7))
# Event-Driven Testing
for i in range(1, len(data)):
    # reset the position after an exit
    if pos_exit:
        pos_exit = False
        pos = "N"
        continue
    # going long if the return goes beyond the lower bound
    # (1 standard deviation); the assumption here is
    # that the stock will revert back to its mean value
    if data["Adj Close"][i] < ((1 - std) * data["ma20"][i]):
        data.at[i, "position"] = 1
        pos = "L"
    # scenario if the return is between the lower and upper bounds
    if pos == "L":
        data.at[i, "position"] = 1
    # updating strategy returns
    data["strategy returns"] = data["daily returns"] * data["position"]
    # exiting if the strategy return drops by 3%
    if data["strategy returns"][i] < -0.03:
        data.at[i, "position"] = 0
        pos_exit = True
data.tail(10)
# taking positions one day after the signals are generated
data["position"].shift(1)
print("Buy and hold returns =", round(list(data["daily returns"].cumsum())[-1], 4) * 100, "%")
print("Strategy returns =", round(list(data["strategy returns"].cumsum())[-1], 4) * 100, "%")

plotting in real-time two plots on the same figure using multi-processing python

Currently I'm working on plotting the data from two sensors (audio and vibration) on the same figure in real time, but I'm facing a problem.
This is my server:
import socket
import struct
import time
import math as M
import sys
import json
import csv
import pytz
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from datetime import datetime, timedelta, tzinfo
...
import multiprocessing
from multiprocessing import Lock, Process, Queue

timel = []
visiond = []

# -----------
freq = 50
seconds = 60  # number of seconds of data to display
gain = 10
wlen = freq / 100
per_lap = 0.9
ch = 'SHZ'
mult = 8.0
displ_samps = seconds * freq
save = False
# -----------
def _nearest_pow_2(x):
    """
    Find the power of two nearest to x
    >>> _nearest_pow_2(3)
    2.0
    >>> _nearest_pow_2(15)
    16.0
    :type x: float
    :param x: Number
    :rtype: int
    :return: Nearest power of 2 to x
    """
    a = M.pow(2, M.ceil(np.log2(x)))
    b = M.pow(2, M.floor(np.log2(x)))
    if abs(a - x) < abs(b - x):
        return a
    else:
        return b

nfft1 = int(_nearest_pow_2(wlen * freq))
nlap1 = int(nfft1 * per_lap)
if mult is not None:
    mult = int(_nearest_pow_2(mult))
    mult = mult * nfft1
# -------------------
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(13, 6))
plt.ion()
fig.show()
fig.canvas.draw()
n = 0
stream = []  # our stream (simple list object)
curr = datetime.now()

# sudo iptables -A INPUT -p tcp -s 192.168.0.9 --dport 50012 -j ACCEPT
# Set up the TCP/IP servers
tcp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
aud_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
dur = 0.02
# Bind the sockets to the server address and ports
server_address = ('localhost', 50021)
server_address_aud = ('localhost', 50022)
tcp_socket.bind(server_address)
aud_socket.bind(server_address_aud)
# Listen for one client on each port
tcp_socket.listen(1)
aud_socket.listen(1)
def update_vib(tcp_socket, n, q, lock, curr, stream):
    while True:
        print("Waiting for connection")
        connection, client = tcp_socket.accept()
        try:
            print("Connected to client IP: {}".format(client))
            if connection:
                current_time = datetime.now(tz=pytz.UTC)
                future_time = current_time + timedelta(seconds=1)
                xx = future_time.strftime("%m/%d/%Y, %H:%M:%S")
                str_1_encoded = bytes(xx, 'UTF-8')
                connection.sendall(str_1_encoded)
            # Receive and print data 32 bytes at a time, as long as the client is sending something
            while True:
                data = connection.recv(1024)
                sample = bytes(data).decode("utf-8")
                sample = (sample.replace('{', '').replace('}', '').replace(' ', '').split(','))
                sample.remove("'SHZ'")
                initial_time = float(sample.pop(0))
                for i in range(0, len(sample)):
                    ts = initial_time + i * dur
                    send_tcp = '%.3f, %s' % (ts, sample[i])
                    id_machine = '4'
                    sample_json = {}
                    sample_json['id'] = id_machine
                    sample_json['type'] = 'vib'
                    dataf = send_tcp.split(",")
                    sample_json['sample_time'] = dataf[0]
                    sample_json['sample_data'] = dataf[1]
                    sample_str = json.dumps(sample_json)
                    str_1_encoded = bytes(sample_str, 'UTF-8')
                    # dataf = send_tcp.split(",")
                    # timel.append(float(dataf[0]))
                    # visiond.append(int(dataf[1]))
                    # drawnow(makeFig)
                    # plt.pause(.000001)
                    fulldata = json.loads(sample_str)
                    timel.append(fulldata['sample_time'])
                    visiond.append(fulldata['sample_data'])
                if not data:
                    break
                s = data.decode('UTF-8').strip("'{}").split(', ')
                if ch in s[0]:  # only listen to the specific channel
                    prev = curr  # timing
                    curr = datetime.now()  # timing
                    fps = 1 / (curr - prev).total_seconds()  # timing
                    Text = "hi"
                    for smp in s[2:]:  # convert strings to ints
                        stream.append(int(smp))
                    npts = len(stream)
                    if npts > displ_samps:  # if the number of samples exceeds the display value, slice the array
                        stream = stream[npts - displ_samps:]  # move the array to the right, drop old samples
                lock.acquire()
                plotting_figures()
                lock.release()
        finally:
            connection.close()
amp22 = []
audiotime = []

def recived_aud(aud_socket, lock):
    while True:
        data = b''
        payload_size2 = struct.calcsize("dd")
        connection, client = aud_socket.accept()
        print("Connected to client IP: {}".format(client))
        if connection:
            while True:
                while len(data) < payload_size2:
                    data = connection.recv(1024)
                    if not data:
                        break
                packed_msg_size = data[:payload_size2]
                data = data[payload_size2:]
                tup = struct.unpack('dd', packed_msg_size)
                audiotime.append(float(tup[0]))
                amp22.append(float(tup[1]))
                print(tup)
                lock.acquire()
                plotting_figures()
                lock.release()
def plotting_figures():
    if stream:
        ax[0].clear()  # ready the plot axis for a new draw
        ax[0].set_xlim(0, len(stream))  # being explicit here helps speed things up slightly
        ax[0].set_ylim(min(stream) - 25, max(stream) + 25)
        ax[0].plot(stream, linewidth=0.5)
    else:
        ax[0].clear()
    if amp22:
        ax[1].clear()  # ready the plot axis for a new draw
        ax[1].set_xlim(0, len(amp22))  # being explicit here helps speed things up slightly
        ax[1].set_ylim(min(amp22) - 25, max(amp22) + 25)
        ax[1].plot(amp22, linewidth=0.5)
    else:
        ax[1].clear()
    fig.canvas.draw()

lock = Lock()
q = Queue()
q.put([fig, ax])
qq = q.get()
p1 = Process(target=update_vib, args=(tcp_socket, n, q, lock, curr, stream))
p2 = Process(target=recived_aud, args=(aud_socket, lock))
p1.start()
p2.start()
p1.join()
p2.join()
the error I'm receiving:
[xcb] Unknown sequence number while processing queue
[xcb] Most likely this is a multi-threaded client and XInitThreads has not been called
[xcb] Aborting, sorry about that.
python3.8: ../../src/xcb_io.c:260: poll_for_event: Assertion `!xcb_xlib_threads_sequence_lost' failed.
I'm aware that matplotlib does not support multi-threading, but I'm using multi-processing instead. Please let me know how I can update the two plots on the same figure simultaneously. Thank you.
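For what it's worth, the usual workaround for this class of error is to let only one process touch matplotlib (the figure here is created before the fork, so both children end up drawing over the same X11 connection). A minimal sketch of that pattern (my assumption, not the original code: the socket-reading details are elided, and the workers just push samples onto a queue that the main process drains and plots):

    import multiprocessing as mp
    import matplotlib.pyplot as plt

    def vib_worker(q):
        # ... read the vibration socket here ...
        q.put(('vib', 123))          # placeholder sample

    def aud_worker(q):
        # ... read the audio socket here ...
        q.put(('aud', 456))          # placeholder sample

    if __name__ == '__main__':
        q = mp.Queue()
        mp.Process(target=vib_worker, args=(q,), daemon=True).start()
        mp.Process(target=aud_worker, args=(q,), daemon=True).start()
        fig, ax = plt.subplots(nrows=2)
        vib, aud = [], []
        while plt.fignum_exists(fig.number):
            while not q.empty():                 # drain everything queued so far
                kind, sample = q.get()
                (vib if kind == 'vib' else aud).append(sample)
            ax[0].clear(); ax[0].plot(vib, linewidth=0.5)
            ax[1].clear(); ax[1].plot(aud, linewidth=0.5)
            plt.pause(0.05)                      # runs the GUI event loop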

Best approach to iterate and append a custom function on a new dataframe

I have the following custom function that generates a row with EMA data for a specific asset based on the current time.
Here's the complete code for the function:
def find_ema(futures_symbol):
    futures_symbol = futures_symbol
    def fetch_ohlc(symbol, timeframe, timesymbol):
        symbol = symbol
        timeframe = timeframe
        timesymbol = timesymbol
        # fetch data from the Binance API
        candlestick_url = 'https://fapi.binance.com/fapi/v1/continuousKlines?pair='+symbol+'&contractType=PERPETUAL&interval='+str(timeframe)+timesymbol+'&limit=1500'
        candlestick_chart = requests.get(candlestick_url).json()
        candlestick_df = pd.DataFrame(candlestick_chart)
        candlestick_df = candlestick_df.iloc[:, 1:7]
        candlestick_df.columns = ['open', 'high', 'low', 'close', 'volume', 'date']
        candlestick_df['date'] = pd.to_datetime(candlestick_df['date'], unit='ms').round('1s')
        candlestick_df.insert(0, 'date', candlestick_df.pop('date'))
        # reset to midnight
        candlestick_df.date = pd.to_datetime(candlestick_df.date)
        min_date = candlestick_df.date.min()
        NextDay_Date = (min_date + datetime.timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
        candlestick_df = candlestick_df[candlestick_df.date >= NextDay_Date].copy()
        candlestick_df = candlestick_df.set_index('date')
        candlestick_df['symbol'] = symbol
        ohlc_data = candlestick_df
        cols = ['open', 'high', 'low', 'close', 'volume']
        ohlc_data[cols] = ohlc_data[cols].apply(pd.to_numeric, errors='coerce')
        ohlc_data[cols] = ohlc_data[cols].round(decimals=2)
        return ohlc_data
    # separate dfs for limited candlestick data
    ohlc_smaller = fetch_ohlc(futures_symbol, 5, 'm')
    ohlc_larger = fetch_ohlc(futures_symbol, 1, 'h')
    ema_df = ohlc_smaller
    # calculating the EMA with 200 rows of data
    ema_df['15m'] = ohlc_smaller.resample('15T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['30m'] = ohlc_smaller.resample('30T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['1h'] = ohlc_larger.resample('60T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['2h'] = ohlc_larger.resample('120T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['4h'] = ohlc_larger.resample('240T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    # forward-fill larger-timeframe data to the smaller timeframe
    ema_df = ema_df.fillna(method='ffill').tail(1)
    ema_df.insert(0, 'symbol', ema_df.pop('symbol'))
    ema_df = ema_df.drop(['high', 'low', 'close', 'volume'], axis=1)
    return ema_df
When I apply this function to a single symbol it returns the dataframe perfectly. For example:
[screenshot: working example on a single symbol]
However, I now have a list of symbols on which I want to apply this function and create a new dataframe.
Here's how I am generating my list of symbols:
symbols_url = 'https://fapi.binance.com/fapi/v1/ticker/price'
symbols_data = requests.get(symbols_url).json()
symbols_df = pd.DataFrame(symbols_data)
symbols_df = symbols_df[symbols_df['symbol'].str.contains('USDT')]
futures_tickers_binance = list(symbols_df['symbol'])
# some end with numbers (e.g. quarterly contracts), hence the filter:
futures_tickers_binance = list(filter(lambda x: x.endswith(('USDT')), futures_tickers_binance))
Here's what I thought would work:
for symbol in futures_tickers_binance:
    for j in range(len(futures_tickers_binance)):
        df = df.append(find_ema(futures_tickers_binance[j]))
        df = df.drop_duplicates()
However, this raises a ValueError:
ValueError: If using all scalar values, you must pass an index
Is there a way to apply this function and generate a new dataframe with the values for the complete list in a faster way?
Thank you in advance for your patience to read this!
The final result would look something like this, but my loop is not working the way it is supposed to:
[screenshot: expected (almost) perfect result]
Here's my complete code if needed:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import datetime
import requests
symbols_url = 'https://fapi.binance.com/fapi/v1/ticker/price'
symbols_data = requests.get(symbols_url).json()
symbols_df = pd.DataFrame(symbols_data)
symbols_df = symbols_df[symbols_df['symbol'].str.contains('USDT')]
futures_tickers_binance = list(symbols_df['symbol'])
# some end with numbers (e.g. quarterly contracts), hence the filter:
futures_tickers_binance = list(filter(lambda x: x.endswith(('USDT')), futures_tickers_binance))
def find_ema(futures_symbol):
    futures_symbol = futures_symbol
    def fetch_ohlc(symbol, timeframe, timesymbol):
        symbol = symbol
        timeframe = timeframe
        timesymbol = timesymbol
        # fetch data from the Binance API
        candlestick_url = 'https://fapi.binance.com/fapi/v1/continuousKlines?pair='+symbol+'&contractType=PERPETUAL&interval='+str(timeframe)+timesymbol+'&limit=1500'
        candlestick_chart = requests.get(candlestick_url).json()
        candlestick_df = pd.DataFrame(candlestick_chart)
        candlestick_df = candlestick_df.iloc[:, 1:7]
        candlestick_df.columns = ['open', 'high', 'low', 'close', 'volume', 'date']
        candlestick_df['date'] = pd.to_datetime(candlestick_df['date'], unit='ms').round('1s')
        candlestick_df.insert(0, 'date', candlestick_df.pop('date'))
        # reset to midnight
        candlestick_df.date = pd.to_datetime(candlestick_df.date)
        min_date = candlestick_df.date.min()
        NextDay_Date = (min_date + datetime.timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
        candlestick_df = candlestick_df[candlestick_df.date >= NextDay_Date].copy()
        candlestick_df = candlestick_df.set_index('date')
        candlestick_df['symbol'] = symbol
        ohlc_data = candlestick_df
        cols = ['open', 'high', 'low', 'close', 'volume']
        ohlc_data[cols] = ohlc_data[cols].apply(pd.to_numeric, errors='coerce')
        ohlc_data[cols] = ohlc_data[cols].round(decimals=2)
        return ohlc_data
    # separate dfs for limited candlestick data
    ohlc_smaller = fetch_ohlc(futures_symbol, 5, 'm')
    ohlc_larger = fetch_ohlc(futures_symbol, 1, 'h')
    ema_df = ohlc_smaller
    # calculating the EMA with 200 rows of data
    ema_df['15m'] = ohlc_smaller.resample('15T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['30m'] = ohlc_smaller.resample('30T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['1h'] = ohlc_larger.resample('60T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['2h'] = ohlc_larger.resample('120T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['4h'] = ohlc_larger.resample('240T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    # forward-fill larger-timeframe data to the smaller timeframe
    ema_df = ema_df.fillna(method='ffill').tail(1)
    ema_df.insert(0, 'symbol', ema_df.pop('symbol'))
    ema_df = ema_df.drop(['high', 'low', 'close', 'volume'], axis=1)
    return ema_df
for symbol in futures_tickers_binance:
    for j in range(len(futures_tickers_binance)):
        df = df.append(find_ema(futures_tickers_binance[j]))
        df = df.drop_duplicates()
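One likely culprit: df is never initialized before the loop, and the doubly nested loop calls find_ema len(futures_tickers_binance)² times. A sketch of a simpler pattern (my assumptions: find_ema returns a one-row DataFrame per symbol, and the error handling shown is illustrative, e.g. for pairs that have no PERPETUAL contract and return an error payload):

    frames = []
    for symbol in futures_tickers_binance:
        try:
            frames.append(find_ema(symbol))
        except Exception as err:  # log instead of silently skipping
            print('skipping', symbol, '-', err)
    df = pd.concat(frames)
    df = df.drop_duplicates()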

Can the csv data be saved in a dictionary for multiprocessing, to save I/O operations and speed up the process?

I have around 1500 csv files with OHLC data of stocks, each containing 90,000-100,000 rows.
Below is the multiprocessing code that processes each of the files (with a number of iterations). When I tried to use 16 processes, my system started to hang a bit. I am fairly sure this is because of heavy I/O use (since the system has to open each and every file). Is it a good idea to load all 1500 csv files into one dictionary and then run the code? Could that reduce the time, or would it slow down the hanging process?
Also, the system works fine with 10 processes.
Here is what the OHLC data looks like:
[screenshot of the OHLC data]
import numpy as np
import pandas as pd
import os
import multiprocessing
import datetime
import itertools
import time
import warnings
warnings.filterwarnings('ignore')

# bank nifty
bn_futures = pd.read_csv('E:\\Tanmay\\Data\\Bank Nifty Index\\BankNifty_Futures GFDL 2011-2020.csv')
bn_futures['Date_time'] = bn_futures['Date'] + ' ' + bn_futures['Time']
bn_futures['Date_time'] = pd.to_datetime(bn_futures['Date_time'], format='%Y-%m-%d %H:%M:%S')
bn_futures = bn_futures[bn_futures['Date_time'].dt.date > datetime.date(2016, 5, 26)]
req_cols = [x for x in bn_futures.columns if 'Unnamed' not in x]
bn_futures = bn_futures[req_cols]
bn_futures['straddle'] = round(bn_futures['Close'], -2)
bn_futures['straddle'] = bn_futures['straddle'].astype(int)
bn_futures['straddle'] = bn_futures['straddle'].astype(str)
bn_futures['Date'] = bn_futures['Date_time'].dt.date
dates = list(set(bn_futures['Date'].to_list()))
dates.sort()
option_files1 = os.listdir('E:\\\\2nd Set\\')
option_files = []
for i in option_files1:
    if datetime.datetime.strptime(i.split('.')[0], '%Y-%m-%d').date() >= datetime.date(2016, 5, 27):
        option_files.append(i)

def time_loop(start_time, end_time, timeframe):
    start_datetime = datetime.datetime.combine(datetime.datetime.today().date(), start_time)
    end_datetime = datetime.datetime.combine(datetime.datetime.today().date(), end_time)
    difference = int((((end_datetime - start_datetime).total_seconds()) / 60) / timeframe)
    final_time_list = []
    for i in range(difference):
        final_time_list.append((start_datetime + datetime.timedelta(minutes=i * timeframe)).time())
    return final_time_list

entry_time_list = time_loop(datetime.time(9, 19), datetime.time(15, 19), 5)
sl_list = np.arange(1.1, 2, 0.1)
# sl_list = list(range(1.1,2,0.1))
paramlist = list(itertools.product(entry_time_list, sl_list))

def strategy(main_entry_time, sl):
    print(main_entry_time, sl)
    main_dict = {}
    for file in option_files:
        date = datetime.datetime.strptime(file.split('.')[0], '%Y-%m-%d').date()
        try:
            # reading current-date bn futures
            bn = bn_futures[bn_futures['Date'] == date]
            # reading entry-time bn futures
            b = bn[bn['Date_time'].dt.time == main_entry_time]
            straddle_value = b['straddle'].iloc[0]
            df = pd.read_csv('E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\' + file)
            df['Date_time'] = pd.to_datetime(df['Date_time'], format='%Y-%m-%d %H:%M:%S')
            h = [k for k in df.columns if 'Un' not in k]
            df = df[h]
            total_df = df[(df['Ticker'].str.contains(straddle_value)) & (df['Expiry_number'] == 0) & (df['W/M'] == 'W')]
            option_types = ['CE', 'PE']
            for option in option_types:
                option_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time == main_entry_time)]
                entry_price = option_df['Close'].iloc[0]
                strike = option
                entry_time = main_entry_time
                trade_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time > main_entry_time)]
                trade_df.sort_values(by='Date_time', inplace=True)
                for t in trade_df.index:
                    if trade_df['Date_time'][t].time() > entry_time:
                        if trade_df['High'][t] > entry_price * sl:
                            exit_price = entry_price * sl
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02 * entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date': str(date), 'Entry_time': entry_time, 'Strike': str(straddle_value)+option, 'Entry_price': entry_price, 'Exit_price': exit_price, 'exit_time': exit_time, 'profit': profit, 'Reason': 'SL'}
                            break
                        if trade_df['Date_time'][t].time() >= datetime.time(15, 14, 0):
                            exit_price = trade_df['Close'][t]
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02 * entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date': str(date), 'Entry_time': entry_time, 'Strike': str(straddle_value)+option, 'Entry_price': entry_price, 'Exit_price': exit_price, 'exit_time': exit_time, 'profit': profit, 'Reason': 'EOD'}
                            break
        except Exception as yy:
            pass
    final_dict = dict(main_dict)
    final_df = pd.DataFrame(final_dict)
    final_df = final_df.transpose()
    final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')

if __name__ == '__main__':
    start_time = time.time()
    # mgr = multiprocessing.Manager()
    # main_dict = mgr.dict()
    total_data = paramlist
    p = multiprocessing.Pool(processes=10)
    p.starmap(strategy, total_data)
    p.close()
Before you can improve the multiprocessing performance, you should make sure your serial implementation is as efficient as it can be. Have you done that?
Your strategy method is currently re-reading every option file for each element of total_data passed to it. This is highly inefficient, and moreover it might be contributing significantly to whatever is stalling your I/O (it depends on caching, which I discuss later). What if the data were put in a database and read up front, or perhaps stored in a dictionary initialized at the beginning?
As for strategy writing out the CSV file: it should instead return the input parameters and final_df back to the main process so the main process can do all the I/O. For this, imap_unordered with a suitable chunksize argument is better suited, so the main process can write the results as they become available. Because we are no longer using starmap, strategy will now be passed a tuple that has to be unpacked:
import numpy as np
import pandas as pd
import os
import multiprocessing
import datetime
import itertools
import time
import warnings
warnings.filterwarnings('ignore')

# bank nifty
if __name__ == '__main__':
    bn_futures = pd.read_csv('E:\\Tanmay\\Data\\Bank Nifty Index\\BankNifty_Futures GFDL 2011-2020.csv')
    bn_futures['Date_time'] = bn_futures['Date'] + ' ' + bn_futures['Time']
    bn_futures['Date_time'] = pd.to_datetime(bn_futures['Date_time'], format='%Y-%m-%d %H:%M:%S')
    bn_futures = bn_futures[bn_futures['Date_time'].dt.date > datetime.date(2016, 5, 26)]
    req_cols = [x for x in bn_futures.columns if 'Unnamed' not in x]
    bn_futures = bn_futures[req_cols]
    bn_futures['straddle'] = round(bn_futures['Close'], -2)
    bn_futures['straddle'] = bn_futures['straddle'].astype(int)
    bn_futures['straddle'] = bn_futures['straddle'].astype(str)
    bn_futures['Date'] = bn_futures['Date_time'].dt.date
    dates = list(set(bn_futures['Date'].to_list()))
    dates.sort()
    option_files1 = os.listdir('E:\\\\2nd Set\\')
    option_files = []
    for i in option_files1:
        if datetime.datetime.strptime(i.split('.')[0], '%Y-%m-%d').date() >= datetime.date(2016, 5, 27):
            option_files.append(i)

    def time_loop(start_time, end_time, timeframe):
        start_datetime = datetime.datetime.combine(datetime.datetime.today().date(), start_time)
        end_datetime = datetime.datetime.combine(datetime.datetime.today().date(), end_time)
        difference = int((((end_datetime - start_datetime).total_seconds()) / 60) / timeframe)
        final_time_list = []
        for i in range(difference):
            final_time_list.append((start_datetime + datetime.timedelta(minutes=i * timeframe)).time())
        return final_time_list

    entry_time_list = time_loop(datetime.time(9, 19), datetime.time(15, 19), 5)
    sl_list = np.arange(1.1, 2, 0.1)
    # sl_list = list(range(1.1,2,0.1))
    paramlist = list(itertools.product(entry_time_list, sl_list))

def init_pool_processes(o_f):
    global option_files
    option_files = o_f

def strategy(tpl):
    main_entry_time, sl = tpl  # unpack tuple
    print(main_entry_time, sl)
    main_dict = {}
    for file in option_files:
        date = datetime.datetime.strptime(file.split('.')[0], '%Y-%m-%d').date()
        try:
            # reading current-date bn futures
            bn = bn_futures[bn_futures['Date'] == date]
            # reading entry-time bn futures
            b = bn[bn['Date_time'].dt.time == main_entry_time]
            straddle_value = b['straddle'].iloc[0]
            df = pd.read_csv('E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\' + file)
            df['Date_time'] = pd.to_datetime(df['Date_time'], format='%Y-%m-%d %H:%M:%S')
            h = [k for k in df.columns if 'Un' not in k]
            df = df[h]
            total_df = df[(df['Ticker'].str.contains(straddle_value)) & (df['Expiry_number'] == 0) & (df['W/M'] == 'W')]
            option_types = ['CE', 'PE']
            for option in option_types:
                option_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time == main_entry_time)]
                entry_price = option_df['Close'].iloc[0]
                strike = option
                entry_time = main_entry_time
                trade_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time > main_entry_time)]
                trade_df.sort_values(by='Date_time', inplace=True)
                for t in trade_df.index:
                    if trade_df['Date_time'][t].time() > entry_time:
                        if trade_df['High'][t] > entry_price * sl:
                            exit_price = entry_price * sl
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02 * entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date': str(date), 'Entry_time': entry_time, 'Strike': str(straddle_value)+option, 'Entry_price': entry_price, 'Exit_price': exit_price, 'exit_time': exit_time, 'profit': profit, 'Reason': 'SL'}
                            break
                        if trade_df['Date_time'][t].time() >= datetime.time(15, 14, 0):
                            exit_price = trade_df['Close'][t]
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02 * entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date': str(date), 'Entry_time': entry_time, 'Strike': str(straddle_value)+option, 'Entry_price': entry_price, 'Exit_price': exit_price, 'exit_time': exit_time, 'profit': profit, 'Reason': 'EOD'}
                            break
        except Exception as yy:
            pass
    #final_dict = dict(main_dict) # why make a copy?
    final_df = pd.DataFrame(main_dict)
    final_df = final_df.transpose()
    #final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')
    return (sl, main_entry_time, date, final_df)

def compute_chunksize(iterable_size, pool_size):
    chunksize, remainder = divmod(iterable_size, 4 * pool_size)
    if remainder:
        chunksize += 1
    return chunksize

if __name__ == '__main__':
    start_time = time.time()
    # mgr = multiprocessing.Manager()
    # main_dict = mgr.dict()
    total_data = paramlist
    POOL_SIZE = 10
    p = multiprocessing.Pool(processes=POOL_SIZE, initializer=init_pool_processes, initargs=(option_files,))
    chunksize = compute_chunksize(len(total_data), POOL_SIZE)
    results = p.imap_unordered(strategy, total_data, chunksize=chunksize)
    for sl, main_entry_time, date, final_df in results:
        final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')
    p.close()
    p.join()
Since you are running under Windows, as I have mentioned before, code at global scope will be executed by each pool process as part of its initialization, so it is inefficient to have code at global scope that is not required by your worker function strategy and is not contained within an if __name__ == '__main__': block. So that is what I have done. Since your worker function does need to reference option_files (for the time being, until the issue I initially raised is addressed), I have used the initializer and initargs arguments of the Pool constructor so that after the option_files list is created once by the main process, it is copied to each process in the pool and used to initialize a global variable option_files. Note also that strategy now returns the parameters needed to build the output filename along with final_df, so the main process can write the CSVs.
But I cannot stress enough that you should figure out a way of eliminating the repeated reading of the files in the option_files list. Ideally, building a dictionary of some sort that can be passed as another argument to init_pool_processes, so that each pool process has access to a copy of the dictionary once constructed, would be ideal (see the sketch below). What might save you is that Windows will cache data; depending on the cache size and the size of the CSV files, the I/O bottleneck may not be as big a problem as it might otherwise be.
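A sketch of that idea (the helper name load_option_data is hypothetical; the base path is the one from the question, and it assumes the ~1500 frames fit in memory):

    def load_option_data(option_files):
        # read every option file exactly once, in the main process
        option_data = {}
        for file in option_files:
            df = pd.read_csv('E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\' + file)
            df['Date_time'] = pd.to_datetime(df['Date_time'], format='%Y-%m-%d %H:%M:%S')
            option_data[file] = df[[k for k in df.columns if 'Un' not in k]]
        return option_data

    def init_pool_processes(o_d):
        global option_data
        option_data = o_d  # each pool process receives the dictionary once, at start-up

    # in the __main__ block:
    #     option_data = load_option_data(option_files)
    #     p = multiprocessing.Pool(processes=POOL_SIZE,
    #                              initializer=init_pool_processes,
    #                              initargs=(option_data,))
    # and inside strategy, replace the pd.read_csv(...) call with:
    #     df = option_data[file]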
In the meanwhile, you could experiment and force "single threading" of the reading and writing of data by using a multiprocessing.Lock with the following changes. If Windows is caching all the reads after the first time you read the option files, it will probably not make too big a difference. The posted code above, which single-threads the writing, should help regardless.
if __name__ == '__main__':
    start_time = time.time()
    # mgr = multiprocessing.Manager()
    # main_dict = mgr.dict()
    total_data = paramlist
    POOL_SIZE = 10
    io_lock = multiprocessing.Lock()
    p = multiprocessing.Pool(processes=POOL_SIZE, initializer=init_pool_processes, initargs=(option_files, io_lock))
    chunksize = compute_chunksize(len(total_data), POOL_SIZE)
    results = p.imap_unordered(strategy, total_data, chunksize=chunksize)
    for sl, main_entry_time, date, final_df in results:
        with io_lock:
            final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')
    p.close()
    p.join()
And:
def init_pool_processes(o_f, lock):
    global option_files, io_lock
    option_files = o_f
    io_lock = lock
And finally:
def strategy(tpl):
    ...
    straddle_value = b['straddle'].iloc[0]
    with io_lock:
        df = pd.read_csv('E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\' + file)

Python Pandas NameError: name 'data' is not defined

I'm new to coding. When I attempt to run this it says:
NameError: name 'data' is not defined.
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import datetime
import json
from bs4 import BeautifulSoup
import requests
import time

def fetchCryptoClose(fsym, tsym):
    # function fetches the close-price time-series from cryptocompare.com
    # it may ignore USDT coin (due to near-zero pricing)
    # daily sampled
    cols = ['date', 'timestamp', fsym]
    lst = ['time', 'open', 'high', 'low', 'close']
    timestamp_today = datetime.today().timestamp()
    curr_timestamp = timestamp_today
    for j in range(2):
        df = pd.DataFrame(columns=cols)
        url = "https://min-api.cryptocompare.com/data/histoday?fsym=" + fsym + \
              "&tsym=" + tsym + "&toTs=" + str(int(curr_timestamp)) + "&limit=3"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        dic = json.loads(soup.prettify())
        for i in range(1, 4):
            tmp = []
            for e in enumerate(lst):
                x = e[0]
                y = dic['Data'][i][e[1]]
                if(x == 0):
                    tmp.append(str(timestamp2date(y)))
                tmp.append(y)
            if(np.sum(tmp[-4::]) > 0):  # remove for USDT
                tmp = np.array(tmp)
                tmp = tmp[[0, 1, 4]]  # filter solely for close prices
                df.loc[len(df)] = np.array(tmp)
        # ensure a correct date format
        df.index = pd.to_datetime(df.date, format="%Y-%m-%d")
        df.drop('date', axis=1, inplace=True)
        curr_timestamp = int(df.ix[0][0])
        if(j == 0):
            df0 = df.copy()
        else:
            data = pd.concat([df, df0], axis=0)
            data.drop("timestamp", axis=1, inplace=True)
    return data  # DataFrame

# N-Cryptocurrency Portfolio (tickers)
fsym = ['BTC', 'ETH', 'XRP', 'LTC', 'DASH', 'XMR', 'ETC', 'MAID', 'XEM', 'REP']
# vs.
tsym = 'USD'

for e in enumerate(fsym):
    print(e[0], e[1])
    if(e[0] == 0):
        try:
            data = fetchCryptoClose(e[1], tsym)
        except:
            pass
    else:
        try:
            data = data.join(fetchCryptoClose(e[1], tsym))
        except:
            pass

# ensure values to be floats
# save portfolio to a file (HDF5 file format)
store = pd.HDFStore('portfolio2.h5')
store['data'] = data
store.close()

# read in your portfolio from a file
df = pd.read_hdf('portfolio2.h5', 'data')
print(df)
Don't use try-except-pass, because it will silence all your exceptions and you might never actually create data.
Replace this code:
for e in enumerate(fsym):
    print(e[0], e[1])
    if(e[0] == 0):
        try:
            data = fetchCryptoClose(e[1], tsym)
        except:
            pass
    else:
        try:
            data = data.join(fetchCryptoClose(e[1], tsym))
        except:
            pass
with this:
for e in enumerate(fsym):
    print(e[0], e[1])
    if(e[0] == 0):
        data = fetchCryptoClose(e[1], tsym)
    else:
        data = data.join(fetchCryptoClose(e[1], tsym))
and see where your real exceptions are.
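If, after surfacing the real exceptions, you still want the loop to survive an occasional bad ticker, a narrower pattern is to catch only the failure you expect and keep going (a sketch, assuming network errors are the realistic failure mode here):

    frames = []
    for i, sym in enumerate(fsym):
        print(i, sym)
        try:
            frames.append(fetchCryptoClose(sym, tsym))
        except requests.RequestException as err:  # catch only what you expect
            print('skipping', sym, '-', err)
    data = frames[0].join(frames[1:])  # fails loudly if nothing was fetched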
