I need to use my own data for a zipline project. I keep getting this error whenever I try:
/Library/Python/2.7/site-packages/zipline/sources/data_source.pyc in <dictcomp>((target, (mapping_func, source_key)))
47 """
48 row = {target: mapping_func(raw_row[source_key])
---> 49 for target, (mapping_func, source_key)
50 in self.mapping.items()}
51 row.update({'source_id': self.get_hash()})
ValueError: cannot convert float NaN to integer
Here is the trading algorithm I am running:
from zipline.algorithm import TradingAlgorithm
from zipline.api import order_target, order, record, symbol, history, add_history
import numpy as np
from pandas import Series, DataFrame, Panel
import pandas as pd
# Define algorithm
def initialize(context):
    context.dateIndex = 0

def handle_data(context, data):
    today = data.major_axis[context.dateIndex]
    if today > data.US9663871021[data.US9663871021.close.notnull()].index[0] and today < data.US9663871021[data.US9663871021.close.notnull()].last_valid_index():
        order(symbol('US9663871021'), 10)
        record(US9663871021=data[symbol('US9663871021')].price)
    if today > data.US7954351067[data.US7954351067.close.notnull()].index[0] and today < data.US7954351067[data.US7954351067.close.notnull()].last_valid_index():
        order(symbol('US7954351067'), 10)
        record(US7954351067=data[symbol('US7954351067')].price)
    if today == data.US9663871021[data.US9663871021.close.notnull()].last_valid_index():
        order_target(symbol('US9663871021'), 0)
        record(US9663871021=data[symbol('US9663871021')].price)
    if today == data.US7954351067[data.US7954351067.close.notnull()].last_valid_index():
        order_target(symbol('US7954351067'), 0)
        record(US7954351067=data[symbol('US7954351067')].price)
    context.dateIndex = context.dateIndex + 1
def prepDf(fileName):
    df = pd.io.parsers.read_csv(fileName, index_col=[0], parse_dates=[0], na_values=["#N/A N/A"],
                                names=["date", "open", "high", "low", "close", "volume", "mkt_cap"])
    df["price"] = df.close
    df.index = df.index.tz_localize('UTC')
    df = df[df.close.notnull()]
    return df
fileName = #fill in file name
fileName2 = #fill in file name
dictionaryOfDfs = {"US9663871021" : prepDf(fileName), "US7954351067": prepDf(fileName2)}
data = Panel(dictionaryOfDfs)
algo_obj = TradingAlgorithm(initialize=initialize,
                            handle_data=handle_data)
# Run algorithm
perf_manual = algo_obj.run(data)
The idea is that I'm buying while the data should be non-NaN and selling the position before the end of the series. There should be no need for the data beyond that point, yet zipline still raises the NaN error even though those values should never be used.
After researching, I believe the solution is to re-index the underlying DataFrames:
df1 = df1.reindex(index=data.major_axis, fill_value=0)
df2 = df2.reindex(index=data.major_axis, fill_value=0)
where data is the pandas Panel
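A minimal sketch of how that re-indexing could be wired in before running the backtest (assuming the prepDf/dictionaryOfDfs setup above); the NaN most likely breaks zipline when it casts a field such as volume to an integer during row mapping:

# Sketch only: reuse the Panel's major_axis as the common date index and
# fill the gaps with 0 so no NaN reaches zipline's row mapping.
data = Panel(dictionaryOfDfs)
filled = {name: df.reindex(index=data.major_axis, fill_value=0)
          for name, df in dictionaryOfDfs.items()}
data = Panel(filled)

# Note: after filling, close is 0 rather than NaN on the padded days, so guards
# based on close.notnull() may need to become close != 0.
algo_obj = TradingAlgorithm(initialize=initialize, handle_data=handle_data)
perf_manual = algo_obj.run(data)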
Related
I have a pandas dataframe with daily data. At the last day of each month, I would like to compute a quantity that depends on the daily data of the previous n months (e.g., n=3).
My current solution is to use the pandas rolling function to compute this quantity for every day, and then, only keep the quantities of the last days of each month (and discard all the other quantities). This however implies that I perform a lot of unnecessary computations.
Does anyone know how I can improve this?
Thanks a lot in advance!
EDIT:
In the following, I add two examples. In both cases, I compute rolling regressions of stock returns. The first (short) example shows the problem described above and is a sub-problem of my actual problem. The second (long) example shows my actual problem. Therefore, I need either a solution to the first example that can be embedded in my algorithm for the second example, or a completely different solution to the second example. Note: The dataframe that I'm using is very large, which means that keeping multiple copies of the entire dataframe is not feasible.
Example 1:
import numpy as np
import pandas as pd
import random
import statsmodels.api as sm

# Generate a time index
dates = pd.date_range("2018-01-01", periods=365, freq="D", name='date')
df = pd.DataFrame(index=dates, columns=['Y','X']).sort_index()

# Generate data
df['X'] = np.array(range(0, 365))
df['Y'] = 3.1*df['X'] - 2.5
df = df.iloc[random.sample(range(365), 280)]        # some days are missing
df.iloc[random.sample(range(280), 20), 0] = np.nan  # some observations are missing
df = df.sort_index()

# Compute beta
def estimate_beta(ser):
    return sm.OLS(df.loc[ser.index, 'Y'], sm.add_constant(df.loc[ser.index, 'X']), missing='drop').fit().params[-1]

df['beta'] = df['Y'].rolling('60D', min_periods=10).apply(estimate_beta)  # use last 60 days and require at least 10 observations

# Get last entries per month
df_monthly = df[['beta']].groupby([pd.Grouper(freq='M', level='date')]).agg('last')
df_monthly
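One way to avoid the full daily rolling computation in Example 1 might be to loop only over the month-end anchor dates and run a single regression on the trailing 60 days for each (a sketch reusing df, sm and np from above; beta_at is a hypothetical helper, not part of my current code):

# Last available row per calendar month
month_ends = df.groupby(pd.Grouper(freq='M', level='date')).tail(1).index

def beta_at(date, window='60D', min_obs=10):
    # Regress Y on X over the trailing window ending at `date`
    win = df.loc[date - pd.Timedelta(window):date].dropna()
    if len(win) < min_obs:
        return np.nan
    return sm.OLS(win['Y'], sm.add_constant(win['X'])).fit().params[-1]

df_monthly = pd.Series({d: beta_at(d) for d in month_ends}, name='beta')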
Example 2:
import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import random
import statsmodels.api as sm

# Generate a time index
dates = pd.date_range("2018-01-01", periods=365, freq="D", name='date')
arrays = [dates.tolist() + dates.tolist(), ["10000"]*365 + ["10001"]*365]
index = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["Date", "Stock"])
df = pd.DataFrame(index=index, columns=['Y','X']).sort_index()

# Generate data
df.loc[idx[:, "10000"], 'X'] = X = np.array(range(0, 365)).astype(float)
df.loc[idx[:, "10000"], 'Y'] = 3*X - 2
df.loc[idx[:, "10001"], 'X'] = X
df.loc[idx[:, "10001"], 'Y'] = -X + 1
df = df.iloc[random.sample(range(365*2), 360*2)]        # some days are missing
df.iloc[random.sample(range(280*2), 20*2), 0] = np.nan  # some observations are missing

# Estimate beta
def estimate_beta_grouped(df_in):
    def estimate_beta(ser):
        return sm.OLS(df.loc[ser.index, 'Y'].astype(float), sm.add_constant(df.loc[ser.index, 'X'].astype(float)), missing='drop').fit().params[-1]
    df = df_in.droplevel('Stock').reset_index().set_index(['Date']).sort_index()
    df['beta'] = df['Y'].rolling('60D', min_periods=10).apply(estimate_beta)
    return df[['beta']]

df_beta = df.groupby(level='Stock').apply(estimate_beta_grouped)

# Extract beta at last day per month
df_monthly = df.groupby([pd.Grouper(freq='M', level='Date'), df.index.get_level_values(1)]).agg('last')  # get last observations
df_monthly = df_monthly.merge(df_beta, left_index=True, right_index=True, how='left')  # merge beta on df_monthly
df_monthly
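The same idea could be applied per stock in Example 2 (again just a sketch under the assumptions above): for each stock, take the last available row of every month and regress only over that stock's trailing 60 days.

def monthly_betas(g):
    # g is the sub-frame of one stock; work on a plain DatetimeIndex
    g = g.droplevel('Stock').sort_index()
    ends = g.groupby(pd.Grouper(freq='M')).tail(1).index
    out = {}
    for d in ends:
        win = g.loc[d - pd.Timedelta('60D'):d].dropna()
        out[d] = (sm.OLS(win['Y'].astype(float), sm.add_constant(win['X'].astype(float))).fit().params[-1]
                  if len(win) >= 10 else np.nan)
    return pd.Series(out, name='beta')

df_monthly_beta = df.groupby(level='Stock').apply(monthly_betas)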
Noob here. PLEASE FORGIVE ABYSMAL FORMATTING as I am still learning. I am trying to create a time series (a dataframe, I think?) that consists of three columns. One is a date column, the next is an inventory column, and the last is a price column.
I have pulled two separate series (date & inventory; date & price) and I want to meld the two series so that I can see three columns instead of two sets of two. This is my code.
import json
import numpy as np
import pandas as pd
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
class EIAgov(object):
    def __init__(self, token, series):
        '''
        Purpose:
        Initialise the EIAgov class by requesting:
        - EIA token
        - id code(s) of the series to be downloaded

        Parameters:
        - token: string
        - series: string or list of strings
        '''
        self.token = token
        self.series = series

    def __repr__(self):
        return str(self.series)

    def Raw(self, ser):
        # Construct url
        url = 'http://api.eia.gov/series/?api_key=' + self.token + '&series_id=' + ser.upper()
        try:
            # URL request, URL opener, read content
            response = urlopen(url)
            raw_byte = response.read()
            raw_string = str(raw_byte, 'utf-8-sig')
            jso = json.loads(raw_string)
            return jso
        except HTTPError as e:
            print('HTTP error type.')
            print('Error code: ', e.code)
        except URLError as e:
            print('URL type error.')
            print('Reason: ', e.reason)

    def GetData(self):
        # Deal with the date series
        date_ = self.Raw(self.series[0])
        date_series = date_['series'][0]['data']
        endi = len(date_series)  # or len(date_['series'][0]['data'])
        date = []
        for i in range(endi):
            date.append(date_series[i][0])
        # Create dataframe
        df = pd.DataFrame(data=date)
        df.columns = ['Date']
        # Deal with data
        lenj = len(self.series)
        for j in range(lenj):
            data_ = self.Raw(self.series[j])
            data_series = data_['series'][0]['data']
            data = []
            endk = len(date_series)
            for k in range(endk):
                data.append(data_series[k][1])
            df[self.series[j]] = data
        return df
if __name__ == '__main__':
    tok = 'mytoken'

    # Natural Gas - Weekly Storage
    ngstor = ['NG.NW2_EPG0_SWO_R48_BCF.W']  # w/ several series at a time ['ELEC.REV.AL-ALL.M', 'ELEC.REV.AK-ALL.M', 'ELEC.REV.CA-ALL.M']
    stordata = EIAgov(tok, ngstor)
    print(stordata.GetData())

    # Natural Gas - Weekly Prices
    ngpx = ['NG.RNGC1.W']  # w/ several series at a time ['ELEC.REV.AL-ALL.M', 'ELEC.REV.AK-ALL.M', 'ELEC.REV.CA-ALL.M']
    pxdata = EIAgov(tok, ngpx)
    print(pxdata.GetData())
Note that 'mytoken' needs to be replaced by an eia.gov API key. I can get this to successfully create an output of two lists...but then to get the lists merged I tried to add this at the end:
joined_frame = pd.concat([ngstor, ngpx], axis = 1, sort=False)
print(joined_frame.GetData())
But I get an error
("TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid")
because apparently I don't know the difference between a list and a series.
How do I merge these lists by date column? Thanks very much for any help. (Also feel free to advise why I am terrible at formatting code correctly in this post.)
If you want to manipulate them as DataFrames in the rest of your code, you can transform ngstor and ngpx into DataFrames as follows:
import pandas as pd
# I create two lists that look like yours
ngstor = [[1,2], ["2020-04-03", "2020-05-07"]]
ngpx = [[3,4] , ["2020-04-03", "2020-05-07"]]
# I transform them to DataFrames
ngstor = pd.DataFrame({"value1": ngstor[0],
                       "date_col": ngstor[1]})
ngpx = pd.DataFrame({"value2": ngpx[0],
                     "date_col": ngpx[1]})
Then you can use either pandas.merge or pandas.concat:
# merge option
joined_frame = pd.merge(ngstor, ngpx, on="date_col",
                        how="outer")
# concat option
ngstor = ngstor.set_index("date_col")
ngpx = ngpx.set_index("date_col")
joined_frame = pd.concat([ngstor, ngpx], axis=1,
                         join="outer").reset_index()
The result will be:
date_col value1 value2
0 2020-04-03 1 3
1 2020-05-07 2 4
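If you keep the EIAgov class above unchanged, the same approach can be applied directly to the two frames it already returns, since each carries a 'Date' column (a sketch; the value columns are named after the EIA series ids):

stor_df = EIAgov(tok, ngstor).GetData()
px_df = EIAgov(tok, ngpx).GetData()

# outer join keeps weeks that appear in only one of the two series
joined_frame = pd.merge(stor_df, px_df, on='Date', how='outer')
print(joined_frame)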
I would like to find the rows which meet the condition RSI < 25.
However, the result is returned as a single data frame. Is it possible to create a separate dataframe for each matching row?
Thanks.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas_datareader import data as wb
stock='TSLA'
ck_df = wb.DataReader(stock,data_source='yahoo',start='2015-01-01')
rsi_period = 14
chg = ck_df['Close'].diff(1)
gain = chg.mask(chg<0,0)
ck_df['Gain'] = gain
loss = chg.mask(chg>0,0)
ck_df['Loss'] = loss
avg_gain = gain.ewm(com = rsi_period-1,min_periods=rsi_period).mean()
avg_loss = loss.ewm(com = rsi_period-1,min_periods=rsi_period).mean()
ck_df['Avg Gain'] = avg_gain
ck_df['Avg Loss'] = avg_loss
rs = abs(avg_gain/avg_loss)
rsi = 100-(100/(1+rs))
ck_df['RSI'] = rsi
RSIFactor = ck_df['RSI'] <25
ck_df[RSIFactor]
If you want to know at what index the RSI < 25 then just use:
ck_df[ck_df['RSI'] <25].index
The result will also be a dataframe. If you insist on making a new one then:
new_df = ck_df[ck_df['RSI'] <25].copy()
To split the rows found by #Omkar's solution into separate dataframes, you might use this function taken from here: Pandas: split dataframe into multiple dataframes by number of rows:
def split_dataframe_to_chunks(df, n):
    df_len = len(df)
    count = 0
    dfs = []

    while True:
        if count > df_len - 1:
            break

        start = count
        count += n

        dfs.append(df.iloc[start:count])
    return dfs
With this you get a list of dataframes.
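For instance, passing n=1 yields one single-row dataframe per row that meets the RSI condition (a sketch based on ck_df from the question):

low_rsi_rows = ck_df[ck_df['RSI'] < 25]
single_row_dfs = split_dataframe_to_chunks(low_rsi_rows, 1)
# single_row_dfs[0], single_row_dfs[1], ... are one-row dataframes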
My process is this:
Import csv of data containing dates, activations, and cancellations
subset the data by activated or cancelled
pivot the data with aggfunc 'sum'
convert back to data frames
Now, I need to merge the 2 data frames together, but there are dates that exist in one data frame and not the other. Both data frames start Jan 1, 2017 and end Dec 31, 2017. Preferably, any date that is missing from one data frame should get a value of 0 in the merged output.
Here's the .head() from both data frames:
For reference, here's the code up to this point:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime
%matplotlib inline
#import data
directory1 = r"C:\python\Contracts"
directory_source = os.path.join(directory1, "Contract_Data.csv")
df_source = pd.read_csv(directory_source)
#format date ranges as times
#df_source["Activation_Month"] = pd.to_datetime(df_source["Activation_Month"])
#df_source["Cancellation_Month"] = pd.to_datetime(df_source["Cancellation_Month"])
df_source["Activation_Day"] = pd.to_datetime(df_source["Activation_Day"])
df_source["Cancellation_Day"] = pd.to_datetime(df_source["Cancellation_Day"])
#subset the data based on status
df_active = df_source[df_source["Order Status"]=="Active"]
df_active = pd.DataFrame(df_active[["Activation_Day", "Event_Value"]].copy())
df_cancelled = df_source[df_source["Order Status"]=="Cancelled"]
df_cancelled = pd.DataFrame(df_cancelled[["Cancellation_Day", "Event_Value"]].copy())
#remove activations outside 2017 and cancellations outside 2017
df_cancelled = df_cancelled[(df_cancelled['Cancellation_Day'] > '2016-12-31') &
                            (df_cancelled['Cancellation_Day'] <= '2017-12-31')]
df_active = df_active[(df_active['Activation_Day'] > '2016-12-31') &
                      (df_active['Activation_Day'] <= '2017-12-31')]
#pivot the data to aggregate by day
df_active_aggregated = df_active.pivot_table(index='Activation_Day',
                                             values='Event_Value',
                                             aggfunc='sum')
df_cancelled_aggregated = df_cancelled.pivot_table(index='Cancellation_Day',
                                                   values='Event_Value',
                                                   aggfunc='sum')
#convert pivot tables back to useable dataframes
activations_aggregated = pd.DataFrame(df_active_aggregated.to_records())
cancellations_aggregated = pd.DataFrame(df_cancelled_aggregated.to_records())
#rename the time columns so they can be referenced when merging into one DF
activations_aggregated.columns = ["index_month", "Activations"]
#activations_aggregated = activations_aggregated.set_index(pd.DatetimeIndex(activations_aggregated["index_month"]))
cancellations_aggregated.columns = ["index_month", "Cancellations"]
#cancellations_aggregated = cancellations_aggregated.set_index(pd.DatetimeIndex(cancellations_aggregated["index_month"]))
I'm aware there are many posts that address issues similar to this but I haven't been able to find anything that has helped. Thanks to anyone that can give me a hand with this!
You can try:
activations_aggregated.merge(cancellations_aggregated, how='outer', on='index_month').fillna(0)
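If days missing from both frames should also show up with 0, one option (a sketch, assuming index_month is already a datetime column as in the code above) is to reindex the merged result against a full 2017 calendar:

merged = activations_aggregated.merge(cancellations_aggregated, how='outer', on='index_month').fillna(0)
full_year = pd.date_range('2017-01-01', '2017-12-31', freq='D')
merged = (merged.set_index('index_month')
                .reindex(full_year, fill_value=0)   # days absent from both frames become 0
                .rename_axis('index_month')
                .reset_index())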
I am trying to create a set of day-of-week boxplots for a timeseries (e.g. 5-minute temperature observations).
My code:
# ts is our timeseries
ts = df.SomeColumn
dow_map = {}
days = ['MON','TUE','WED','THU','FRI','SAT','SUN']
dow_idx = ts.index.dayofweek
i = 0
for d in days:
    dow_map[d] = ts[dow_idx == i]
    i = i + 1
df = pd.DataFrame(dow_map)
df.boxplot()
results in:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-898-6070c45e4c4b> in <module>()
41 i = i + 1
42
---> 43 df = pd.DataFrame(dow_map)
44 df.boxplot()
...
Exception: Reindexing only valid with uniquely valued Index objects
I did find success by creating DataFrames for each day of the week and then concat-ing them into a final DataFrame, but this seems inefficient...
First, create the data frame and use the weekday attribute to get the day of the week:
import pandas as pd
import numpy.random as random
n=1000
df = pd.DataFrame(random.randn(n), pd.date_range('2010-01-01', periods=n), columns=["data"])
df['Dates'] = df.index
df['week_days'] =df.index.weekday
df
Now pivot the table so that the week_days values become the columns (you could also map the weekday numbers to day-name strings, but I'll leave that to you):
x =df.pivot(index='Dates', columns='week_days', values='data')
x.boxplot()
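To show day names instead of the weekday numbers 0-6 as column labels, you can map them through the calendar module, optionally under a specific locale (pl_PL in the example below):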
import locale, calendar
# for example pl_PL
locale.setlocale(locale.LC_ALL, 'pl_PL.UTF-8')
x = x.rename(columns=lambda d: calendar.day_abbr[d].capitalize())