How to create PandasData for backtrader with ccxt - Python

I am trying to back-test a strategy using backtrader. Most of the examples I have seen only use a CSV file. I was wondering if it is possible to just get data from an exchange, turn it into a pandas DataFrame, and then use backtrader? When I run it I get the error AttributeError: 'numpy.int64' object has no attribute 'lower', which refers to pandafeed.py.
import ccxt, os

import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
import backtrader


class Trader:
    def __init__(self) -> None:
        load_dotenv()
        self.connect()

    def connect(self):
        """Creates the Binance client."""
        self.exchange = ccxt.binance({
            'apiKey': os.getenv('BINANCE_API_KEY'),
            'secret': os.getenv('BINANCE_API_SECRET')
        })


# trading_pair and interval are defined elsewhere in the script
klines = Trader().exchange.fetch_ohlcv(symbol=trading_pair, timeframe=interval)
dataFrame = pd.DataFrame(klines)
dataFrame[0] = [datetime.fromtimestamp(t / 1000) for t in dataFrame[0]]

data = backtrader.feeds.PandasData(dataname=dataFrame)

cerebro = backtrader.Cerebro()
cerebro.broker.set_cash(10000)
cerebro.adddata(data)
cerebro.run()
If I use column names and change my code to the below:
columns = ['datetime', 'open', 'high', 'low', 'close', 'volume']
dataFrame = pd.DataFrame(klines, columns=columns)
dataFrame["datetime"] = [datetime.fromtimestamp(t / 1000) for t in dataFrame["datetime"]]
data = backtrader.feeds.PandasData(dataname=dataFrame)
I get this error:
AttributeError: 'int' object has no attribute 'to_pydatetime'
(again in pandafeed.py)
My question is: how do I turn this list into something I can use to run backtrader? Thank you.
P.S. An example of the data structure returned by klines looks like this:
[
[1621152000000, 49375.28, 49795.89, 48656.0, 49014.99, 10956.006583],
[1621166400000, 49014.99, 49249.06, 47566.01, 47727.26, 14166.961995],
[1621180800000, 47727.26, 48097.59, 44444.44, 45549.26, 36819.653456],
[1621195200000, 45553.24, 46480.0, 43825.39, 46431.5, 28724.055984],
[1621209600000, 46426.83, 46686.0, 42777.0, 42915.46, 28171.858447],
[1621224000000, 42915.46, 45400.0, 42196.97, 45149.18, 40557.45817],
[1621238400000, 45143.28, 45800.0, 44291.84, 45731.39, 23851.50751],
[1621252800000, 45733.55, 45791.04, 43156.0, 43362.75, 23137.989315],
[1621267200000, 43357.0, 44400.0, 42001.0, 44197.73, 30883.162039],
[1621281600000, 44197.73, 44939.2, 42500.0, 43538.04, 20055.197255],
[1621296000000, 43538.02, 45281.34, 43150.79, 44779.83, 19252.919453],
[1621310400000, 44774.78, 45799.29, 44738.26, 45172.7, 17218.430549],
[1621324800000, 45172.69, 45420.0, 44607.08, 45225.71, 8427.020047]
]

I think backtrader is trying to read headers where there are none. Try telling PandasData there are no headers. See the docs
data = backtrader.feeds.PandasData(dataname=dataFrame, header=None)

I got the same issue, and after debugging it I found it happens because backtrader expects your dataframe to be indexed by the datetime (to keep it simple).
To solve it, just add this to your code:
...
dataFrame = dataFrame.set_index('datetime')
...
In my case I also had to change the type of the 'datetime' field, which I did with:
...
dataFrame["datetime"] = dataFrame["datetime"].values.astype(dtype='datetime64[ms]')
dataFrame = dataFrame.set_index('datetime')
...
I hope that helps you, even if you asked this question two months ago.
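Putting both suggestions together, a minimal end-to-end sketch might look like this (assuming klines already holds the OHLCV list shown above):

import backtrader
import pandas as pd

# klines is the list of lists returned by exchange.fetch_ohlcv(...)
columns = ['datetime', 'open', 'high', 'low', 'close', 'volume']
dataFrame = pd.DataFrame(klines, columns=columns)

# convert the millisecond timestamps and index the frame by datetime,
# which is what backtrader's PandasData expects by default
dataFrame['datetime'] = pd.to_datetime(dataFrame['datetime'], unit='ms')
dataFrame = dataFrame.set_index('datetime')

data = backtrader.feeds.PandasData(dataname=dataFrame)

cerebro = backtrader.Cerebro()
cerebro.broker.set_cash(10000)
cerebro.adddata(data)
cerebro.run()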

Related

How to solve great expectations "MetricResolutionError: Cannot compile Column object until its 'name' is assigned." Error?

I am trying to use Great Expectations. The function I want to use is expect_compound_columns_to_be_unique.
This is the code (main code - template):
import datetime

import pandas as pd
import great_expectations as ge
import great_expectations.jupyter_ux
from great_expectations.core.batch import BatchRequest
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.exceptions import DataContextError

context = ge.data_context.DataContext()

# Note that if you modify this batch request, you may save the new version as a .json file
# to pass in later via the --batch-request option
batch_request = {'datasource_name': 'impala_okh', 'data_connector_name': 'default_inferred_data_connector_name', 'data_asset_name': 'okh.okh_forecast_prod', 'limit': 1000}

# Feel free to change the name of your suite here. Renaming this will not remove the other one.
expectation_suite_name = "okh_forecast_prod"
try:
    suite = context.get_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f'Loaded ExpectationSuite "{suite.expectation_suite_name}" containing {len(suite.expectations)} expectations.')
except DataContextError:
    suite = context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f'Created ExpectationSuite "{suite.expectation_suite_name}".')

validator = context.get_validator(
    batch_request=BatchRequest(**batch_request),
    expectation_suite_name=expectation_suite_name
)
column_names = [f'"{column_name}"' for column_name in validator.columns()]
print(f"Columns: {', '.join(column_names)}.")
validator.head(n_rows=5, fetch_all=False)
The function (the error occurs here):
validator.expect_compound_columns_to_be_unique(['column1', 'column2'])
Then I am getting the following error:
MetricResolutionError: Cannot compile Column object until its 'name' is assigned.
How can I solve this problem?

TypeError: 'NoneType' object is not subscriptable for JSON parsing

I am trying to extract each event's fee information from this website for pages 1 to 20 using Python. The event fee comes from an external URL, therefore I need to parse it with json.loads and extract the pattern from the JSON. I tried the code from this post and get the following error: TypeError: 'NoneType' object is not subscriptable. Based on my research, it means that the object is equal to None and therefore not subscriptable.
I have tried assigning 'NA' to the object if the value is None, but still without success. I would appreciate it if anyone could kindly explain. Below is the code that I have tried:
import re
import json
import requests

event_fees = []
for i in range(20):
    urls = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=' + str(i)
    events_url = 'https://www.eventbrite.com/api/v3/destination/events/?event_ids={event_ids}&expand=event_sales_status,primary_venue,image,saves,my_collections,ticket_availability&page_size=99999'
    html_text = requests.get(urls).text
    data1 = json.loads(re.search(r'window\.__SERVER_DATA__ = ({.*});', html_text).group(1))
    event_ids = ','.join(r['id'] for r in data1['search_data']['events']['results'])
    data2 = requests.get(events_url.format(event_ids=event_ids)).json()
    for e in data2['events']:
        fees = (e['ticket_availability']['minimum_ticket_price']['display'], 'to', e['ticket_availability']['maximum_ticket_price']['display'])
        if fees is None:
            event_fees.append("NA")
        else:
            event_fees.append(fees)
Check whether the key exists in that object before subscripting it (obj['key']). In this case data2['events'][6]['ticket_availability']['minimum_ticket_price'] is None.
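A minimal sketch of that guard inside the loop, reusing the field names from the question's code:

for e in data2['events']:
    availability = e.get('ticket_availability') or {}
    min_price = availability.get('minimum_ticket_price')
    max_price = availability.get('maximum_ticket_price')
    if min_price is None or max_price is None:
        # the price object is missing for this event, so record a placeholder
        event_fees.append("NA")
    else:
        event_fees.append((min_price['display'], 'to', max_price['display']))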

Most efficient way to Twitter Stream?

My partner and I started learning Python at the beginning of the year. I am at the point where (a) my partner and I are almost finished with our code, but (b) we are pulling our hair out trying to get it to work.
Assignment: pull 250 tweets based on a certain topic, geocode the location of the tweets, analyze them based on sentiment, then display them on a web map. We have accomplished almost all of that except the 250-tweet requirement.
I do not know how to pull the tweets more efficiently. The code works, but it writes only around seven to twelve rows of information to a CSV before it times out.
I tried setting a tracking parameter, but received this error: TypeError: 'NoneType' object is not subscriptable
I tried expanding the locations parameter to stream.filter(locations=[-180,-90,180,90]), but ran into a similar problem: TypeError: 'NoneType' object has no attribute 'latitude'
I really do not know what I am missing and I was wondering if anyone has any ideas.
CODE BELOW:
from geopy import geocoders
from geopy.exc import GeocoderTimedOut
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from textblob import TextBlob
import json
import csv

def geo(location):
    g = geocoders.Nominatim(user_agent='USER')
    if location is not None:
        loc = g.geocode(location, timeout=None)
        if loc.latitude and loc.longitude is not None:
            return loc.latitude, loc.longitude

def WriteCSV(user, text, sentiment, lat, long):
    f = open('D:/PATHWAY/TO/tweets.csv', 'a', encoding="utf-8")
    write = csv.writer(f)
    write.writerow([user, text, sentiment, lat, long])
    f.close()

CK = ''
CS = ''
AK = ''
AS = ''
auth = tweepy.OAuthHandler(CK, CS)
auth.set_access_token(AK, AS)

# By setting these values to true, our code will automatically wait as it hits its limits
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Now I'm going to set up a stream listener
# https://stackoverflow.com/questions/20863486/tweepy-streaming-stop-collecting-tweets-at-x-amount
# https://wafawaheedas.gitbooks.io/twitter-sentiment-analysis-visualization-tutorial/sentiment-analysis-using-textblob.html
class StdOutListener(tweepy.StreamListener):
    def __init__(self, api=None):
        super(StdOutListener, self).__init__()
        self.num_tweets = 0

    def on_data(self, data):
        Data = json.loads(data)
        Author = Data['user']['screen_name']
        Text = Data['text']
        Tweet = TextBlob(Data["text"])
        Sentiment = Tweet.sentiment.polarity
        x, y = geo(Data['place']['full_name'])
        if "coronavirus" in Text:
            WriteCSV(Author, Text, Sentiment, x, y)
        self.num_tweets += 1
        if self.num_tweets < 50:
            return True
        else:
            return False

stream = tweepy.Stream(auth=api.auth, listener=StdOutListener())
stream.filter(locations=[-122.441, 47.255, -122.329, 47.603])
The Twitter and geolocation APIs return all kinds of data, and some of the fields may be missing.
TypeError: 'NoneType' object has no attribute 'latitude'
This error comes from here:
loc = g.geocode(location, timeout=None)
if loc.latitude and loc.longitude is not None:
    return loc.latitude, loc.longitude
You provide a location and it searches for that location, but when it cannot find it, it writes None into loc.
Consequently loc.latitude won't work, because loc is None.
You should check loc before accessing any of its attributes.
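A guarded version of geo() might look like this (a sketch that keeps the question's Nominatim setup and simply returns None when the lookup fails):

def geo(location):
    g = geocoders.Nominatim(user_agent='USER')
    if location is None:
        return None
    loc = g.geocode(location, timeout=None)
    if loc is None:
        # Nominatim could not resolve the place name
        return None
    return loc.latitude, loc.longitude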
x, y = geo(Data['place']['full_name'])
I know you are filtering tweets by location, so your Twitter Status object should have Data['place']['full_name']. But this is not always the case. You should check that the key really does exist before accessing the values; a sketch of that follows below.
This applies generally and should be applied to your whole code: write robust code. You will have an easier time debugging mistakes if you add some try/except handling and print out the objects to see how they are built. Maybe set a breakpoint in your except block and do some live inspection.
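For on_data, a sketch of the same defensive style, assuming the guarded geo() above and reusing the field names from the question's code:

def on_data(self, data):
    Data = json.loads(data)
    place = Data.get('place') or {}          # 'place' can be missing or None
    coords = geo(place.get('full_name'))
    if coords is None:
        return True                          # skip tweets we cannot geocode
    x, y = coords
    text = Data.get('text', '')
    if "coronavirus" in text:
        sentiment = TextBlob(text).sentiment.polarity
        WriteCSV(Data['user']['screen_name'], text, sentiment, x, y)
    self.num_tweets += 1
    return self.num_tweets < 50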

Writing data from Pub/Sub to Bigtable via Cloud Functions

I am a beginner with Cloud Bigtable and have big issues using Cloud Functions to write data from Pub/Sub to Bigtable.
The Cloud Function gets the messages from Pub/Sub, but the issue is with the next step: writing them into Bigtable.
The message is created in a Python script and sent to Pub/Sub.
One example of a message:
b'{"eda":2.015176,"temperature":33.39,"bvp":-0.49,"x_acc":-36.0,"y_acc":-38.0,"z_acc":-128.0,"heart_rate":83.78,"iddevice":15.0,"timestamp":"2019-12-01T20:01:36.927Z"}'
For writing it into Bigtable I created a table:
from google.cloud import bigtable
from google.cloud.bigtable import column_family

client = bigtable.Client(project="projectid", admin=True)
instance = client.instance("bigtableinstance")
table = instance.table("bigtable1")

print('Creating the {} table.'.format(table))
print('Creating columnfamily cf1 with Max Version GC rule...')
max_versions_rule = column_family.MaxVersionsGCRule(2)
column_family_id = 'cf1'
column_families = {column_family_id: max_versions_rule}
if not table.exists():
    table.create(column_families=column_families)
    print("Table {} is created.".format(table))
else:
    print("Table {} already exists.".format(table))
This works without problems.
Now I am trying to write the message from Pub/Sub to Bigtable with the following Python code in Cloud Functions, using the main method:
import json
import base64
import os

from google.cloud import bigtable
from google.cloud.bigtable import column_family, row_filters

project_id = os.environ.get('projetid', 'UNKNOWN')
INSTANCE = 'bigtableinstance'
TABLE = 'bigtable1'

client = bigtable.Client(project=project_id, admin=True)
instance = client.instance(INSTANCE)
colFamily = "cf1"

def writeToBigTable(table, data):
    # Parameters: row_key (bytes) – The key for the row being created.
    # Returns: A row owned by this table.
    row_key = data[colFamily]['iddevice'].value.encode()
    row = table.row(row_key)
    for colFamily in data.keys():
        for key in data[colFamily].keys():
            row.set_cell(colFamily,
                         key,
                         data[colFamily][key])
    table.mutate_rows([row])
    return data

def selectTable():
    stage = os.environ.get('stage', 'dev')
    table_id = TABLE + stage
    table = instance.table(table_id)
    return table

def main(event, context):
    data = base64.b64decode(event['data']).decode('utf-8')
    print("DATA: {}".format(data))
    eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp = data.split(',')
    table = selectTable()
    data = {'eda': eda,
            'temperature': temperature,
            'bvp': bvp,
            'x_acc': x_acc,
            'y_acc': y_acc,
            'z_acc': z_acc,
            'heart_rate': heart_rate,
            'iddevice': iddevice,
            'timestamp': timestamp}
    writeToBigTable(table, data)
    print("Data Written: {}".format(data))
I tried different versions but cannot find a solution.
Thanks for the help.
All the best
Dominik
I think this line is wrong:
row_key = data[colFamily]['iddevice'].value.encode()
You're passing in the data object, but it doesn't have a 'cf1' property. You also don't have to encode it. Give this a try:
row_key = data['iddevice']
Your for loop will also have the same issue. I think this is what you want instead:
for key in data.keys():
    row.set_cell(colFamily, key, data[key])
Also, I know you're just playing with it, but using a device id as the only value for a row key will end up poorly. What is recommended is to combine the device id with the date or one of your other properties (depending on your query), and use that as your row key. There is a document on Cloud Bigtable schema design that is helpful, and a codelab using a more realistic sample dataset that walks through how to pick a schema for that example. It's in Java, but you can still import the data and run your own queries.
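Putting that together, a hedged sketch of the write path might look like this (it relies on the question's module-level colFamily and assumes data is already a dict parsed from the JSON message; the row-key layout and byte encoding are only illustrative):

def writeToBigTable(table, data):
    # combine the device id and the timestamp so rows for the same device don't overwrite each other
    row_key = '{}#{}'.format(data['iddevice'], data['timestamp']).encode('utf-8')
    row = table.row(row_key)
    for key, value in data.items():
        # Bigtable cells hold bytes, so encode every value explicitly
        row.set_cell(colFamily, key, str(value).encode('utf-8'))
    row.commit()   # table.mutate_rows([row]) would also work
    return data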
First, thanks a lot for the help.
I tried to fix it with your code recommendation, but unfortunately it doesn't work now due to other errors:
AttributeError: 'DirectRow' object has no attribute 'append'
I guess this comes from the following lines of code:
row.set_cell(colFamily,
             key,
             data[key])
I could imagine that the error originates in the split of the string data:
eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp = data.split(',')
E.g. eda would look like this:
"'eda':2.015176"
which looks pretty wrong to me.
Especially when I insert it into the following dict:
data = {'eda': eda, ...}
The error
AttributeError: 'DirectRow' object has no attribute 'append'
seems to say that there is a problem with the data I want to process with set_cell. The docs mention set_cell with rows as a list or any other iterable of DirectRow instances. Shouldn't a dict work for it?
I tried a workaround with a list, but this seems to make it even worse.
client = bigtable.Client(project=project_id, admin=True)
instance = client.instance(INSTANCE)
colFamily = "cf1"

def writeToBigTable(table, dat):
    row_key = "{}-{}".format(dat[16], dat[17])
    row = table.row(row_key)
    for n in range(len(dat)):
        row.set_cell(colFamily,
                     dat[n],
                     dat[n + 9])
    table.mutate_rows([row])
    return dat

def selectTable():
    stage = os.environ.get('stage', 'dev')
    table_id = TABLE + stage
    table = instance.table(table_id)
    return table

def main(event, context):
    data = base64.b64decode(event['data']).decode('utf-8')
    print("DATA: {}".format(data))
    var_1, eda, var_2, temperature, var_3, bvp, var_4, x_acc, var_5, y_acc, var_6, z_acc, var_7, heart_rate, var_8, iddevice, var_9, timestamp = data.replace(':', ',').split(',')
    table = selectTable()
    dat = [var_1, var_2, var_3, var_4, var_5, var_6, var_7, var_8, var_9, eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp]
    # data = {'eda': eda,
    #         'temperature': temperature,
    #         'bvp': bvp,
    #         'x_acc': x_acc,
    #         'y_acc': y_acc,
    #         'z_acc': z_acc,
    #         'heart_rate': heart_rate,
    #         'iddevice': iddevice,
    #         'timestamp': timestamp}
    writeToBigTable(table, dat)
    print("Data Written: {}".format(data))
I am really stuck on this problem and have no further ideas how to solve it.
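Since the Pub/Sub payload shown in the question is already JSON, one possible way forward (a sketch, not the original poster's code) is to let json.loads do the parsing instead of splitting the string by hand, so the fields keep their names and types:

import base64
import json

def main(event, context):
    payload = base64.b64decode(event['data']).decode('utf-8')
    data = json.loads(payload)   # a dict like {'eda': 2.015176, 'temperature': 33.39, ...}
    writeToBigTable(selectTable(), data)
    print("Data Written: {}".format(data))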

Reset index name in elasticsearch dsl

I'm trying to create an ETL that extracts from Mongo, processes the data, and loads it into Elastic. I will do a daily load, so I thought of naming my index with the current date. This will help me with a later processing step I need to do on this first index.
I used the elasticsearch-dsl guide: https://elasticsearch-dsl.readthedocs.io/en/latest/persistence.html
The problem I have comes from my little experience working with classes. I don't know how to reset the index name from the class.
Here is my code for the class (custom_indices.py):
from elasticsearch_dsl import Document, Date, Integer, Keyword, Text
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Search
import datetime

class News(Document):
    title = Text(analyzer='standard', fields={'raw': Keyword()})
    manual_tagging = Keyword()

    class Index:
        name = 'processed_news_' + datetime.datetime.now().strftime("%Y%m%d")

    def save(self, **kwargs):
        return super(News, self).save(**kwargs)

    def is_published(self):
        return datetime.now() >= self.processed
And this is the part of the code where I create an instance of that class:
from custom_indices import News
import elasticsearch
import elasticsearch_dsl
from elasticsearch_dsl.connections import connections
import pandas as pd
import datetime

connections.create_connection(hosts=['localhost'])
News.init()

for index, doc in df.iterrows():
    new_insert = News(meta={'id': doc.url_hashed},
                      title=doc.title,
                      manual_tagging=doc.customTags,
                      )
    new_insert.save()
Every time I call the News class I would expect it to have a new name. However, the name doesn't change even if I load the class again (from custom_indices import News). I know this is only a problem I have when testing, but I'd like to know how to force that "reset". Actually, I originally wanted to change the name outside the class with this line right before the loop:
News.Index.name = "NEW_NAME"
However, that didn't work; I still saw the name defined on the class.
Could anyone please assist?
Many thanks!
PS: this must be just an object-oriented programming issue. Apologies for my ignorance on the subject.
Maybe you could take advantage of the fact that Document.init() accepts an index keyword argument. If you want the index name to be set automatically, you could implement init() in the News class and call super().init(...) in your implementation.
A simplified example (Python 3.x):
from elasticsearch_dsl import Document
from elasticsearch_dsl.connections import connections
import datetime

class News(Document):
    @classmethod
    def init(cls, index=None, using=None):
        index_name = index or 'processed_news_' + datetime.datetime.now().strftime("%Y%m%d")
        return super().init(index=index_name, using=using)
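Usage would then look roughly like this (a sketch; the explicit index name is only an illustration):

News.init()                                   # creates/updates processed_news_<today's date>
News.init(index='processed_news_20200101')    # or pin a specific index name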
You can override the index when you call save():
new_insert.save(index='processed_news_' + datetime.datetime.now().strftime("%Y%m%d"))
A full example follows.
# coding: utf-8
import datetime

from elasticsearch_dsl import Keyword, Text, \
    Index, Document, Date
from elasticsearch_dsl.connections import connections

HOST = "localhost:9200"

index_names = [
    "foo-log-",
    "bar-log-",
]

default_settings = {"number_of_shards": 4, "number_of_replicas": 1}
index_settings = {
    "foo-log-": {
        "number_of_shards": 40,
        "number_of_replicas": 1
    }
}

class LogDoc(Document):
    level = Keyword(ignore_above=256)
    date = Date(format="yyyy-MM-dd'T'HH:mm:ss.SSS")
    hostname = Text(fields={'fields': Keyword(ignore_above=256)})
    message = Text()
    createTime = Date(format="yyyy-MM-dd'T'HH:mm:ss.SSS")

def auto_create_index():
    '''Automatically create the ES indices.'''
    connections.create_connection(hosts=[HOST])
    for day in range(3):
        dt = datetime.datetime.now() + datetime.timedelta(days=day)
        for index in index_names:
            name = index + dt.strftime("%Y-%m-%d")
            settings = index_settings.get(index, default_settings)
            idx = Index(name=name)
            idx.document(LogDoc)
            idx.settings(**settings)
            try:
                idx.create()
            except Exception as e:
                print(e)
                continue
            print("create index %s" % name)

if __name__ == '__main__':
    auto_create_index()
