I'm seeing some strange behavior when converting strings to floats in a loop.
When I run the following code exactly as written, it writes:
[1468436874000, 0.00254071495719],
[1468528803000, 0.00341349353996],
[1468688596000, 0.000853373384991],
[1468871365000, 0.00256012015497],
It stops short: there should be about 30 more lines than that, and those values are the wrong calculations.
The function is:
def main():
    minprice = pricelist('MIN')
    maxprice = pricelist('MAX')
    avgprice = pricelist('AVG')
    avgsqft = sqftlist()
    datelist = getdates()
    index = fileparser()
    with open('/home/andrew/Desktop/index3.htm', 'w') as file:
        file.writelines(data[:index[0]])
        for date, minprice, maxprice in zip(datelist, minprice, maxprice):
            file.writelines('[%s, %s, %s],\n' % (date, minprice, maxprice))
        file.writelines(data[index[1]:index[2]])
        for date, avgprice in zip(datelist, avgprice):
            file.writelines('[%s, %s],\n' % (date, avgprice))
        file.writelines(data[index[3]:index[4]])
        for date, avgprice, avgsqft in zip(datelist, avgprice, avgsqft):
            file.writelines('[%s, %s],\n' % (date, ((float(avgprice))/(float(avgsqft)))))
        file.writelines(data[index[5]:])
        file.close()
The error is:
file.writelines('[%s, %s],\n' % (date, ((float(avgprice))/(float(avgsqft)))))
ValueError: could not convert string to float: .
Oddly, when I comment out the other for loops before it, the result is:
[1468436874000, 2.82644376127],
[1468528803000, 2.86702735915],
[1468688596000, 2.8546107764],
[1468871365000, 2.8546107764],
[1468871996000, 2.8546107764],
[1468919656000, 2.85383420662],
[1469004050000, 2.85189704903],
[1469116491000, 2.87361540168],
[1469189815000, 2.86059636119],
[1469276601000, 2.83694745621],
[1469367041000, 2.83903252711],
[1469547497000, 2.83848688853],
[1469649630000, 2.83803033196],
[1469736031000, 2.82327110329],
[1469790030000, 2.82650020338],
[1469876430000, 2.96552660866],
[1470022624000, 2.93407180385],
Moreover, when I use enumerate instead of zip (and make the appropriate changes), it works. I've examined both lists at the fifth item for anything unusual, because that's where it gets hung up, but there is nothing odd in either list. Since it works fine with enumerate I'll just do that for now, but I'm new to Python/programming in general and want to understand what exactly is causing this.
UPDATE: I should have included this the first time.
# file.writelines(data[:index[0]+1])
# for date, minprice, maxprice in zip(datelist, minprice, maxprice):
#     file.writelines('[%s, %s, %s],\n' % (date, minprice, maxprice))
# file.writelines(data[index[1]:index[2]+1])
# for date, avgprice in zip(datelist, avgprice):
#     file.writelines('[%s, %s],\n' % (date, avgprice))
# file.writelines(data[index[3]:index[4]+1])
# time.sleep(1)
for date, avgprice, avgsqft in zip(datelist, avgprice, avgsqft):
    # file.writelines(
    print '[%s, %s],\n' % (date, ((float(avgprice))/(float(avgsqft))))
# file.writelines(data[index[5]:])
# file.close()
Prints... (correctly)
[1468436874000, 2.82644376127],
[1468528803000, 2.86702735915],
[1468688596000, 2.8546107764],
[1468871365000, 2.8546107764],
[1468871996000, 2.8546107764],
[1468919656000, 2.85383420662],
etc...
Debug by printing the values of avgprice and avgsqft in your code. One of them has a string value that cannot be converted to a float.
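As a hedged illustration of what is most likely going on (a minimal sketch with made-up data, not your actual pricelist output): the earlier loops reuse the names minprice, maxprice, and avgprice as loop variables, so after the second loop avgprice is no longer a list but a single string. The third zip then iterates over the characters of that string, which is why the output stops short and why float() eventually receives '.':

# Minimal sketch of the suspected name-rebinding problem (assumed data).
datelist = [1468436874000, 1468528803000, 1468688596000, 1468871365000, 1468871996000]
avgprice = ['2650.50', '2651.10', '2649.80', '2652.00', '2650.00']
avgsqft = ['928.2', '924.7', '928.2', '929.1', '928.4']

# Second loop from the question: avgprice is rebound to its last element.
for date, avgprice in zip(datelist, avgprice):
    pass
print(avgprice)   # '2650.00' -- a single string now, not a list

# Third loop: zip now iterates over the characters '2', '6', '5', '0', '.', ...
# so it stops after a handful of rows and then raises
# ValueError: could not convert string to float: . at the '.' character.
for date, avgprice, avgsqft in zip(datelist, avgprice, avgsqft):
    print('[%s, %s],' % (date, float(avgprice) / float(avgsqft)))

The enumerate version presumably works because you index into the original lists instead of rebinding their names. Giving the loop variables different names (e.g. for date, p in zip(datelist, avgprice)) would fix the zip version as well.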
I just switched from MATLAB to Python, and I am even newer to the backtrader library for backtesting trading strategies, so my questions might seem obvious.
My problem seems similar to this:
https://community.backtrader.com/topic/2857/wanted-exit-long-and-open-short-on-the-same-bar-and-vice-versa
and this:
https://community.backtrader.com/topic/2797/self-close-does-not-clear-position
The code below is a simple MACD strategy:
# -*- coding: utf-8 -*-
"""
"""
import backtrader as bt
import argparse
import backtrader.feeds as btFeeds
import numpy as np
import yfinance as yf
import pandas as pd
import talib


class SimpleMACDStrat(bt.Strategy):

    def __init__(self):
        # Keep a reference to the "close" line in the data[0] dataseries
        self.dataclose = self.datas[0].close
        self.order = None

    def log(self, txt, dt=None):
        dt = dt or self.datas[0].datetime.date(0)
        print(f'{dt.isoformat()} {txt}')
        # Print date and close

    def notify_order(self, order):
        if order.status in [order.Submitted, order.Accepted]:
            # Buy/Sell order submitted/accepted to/by broker - Nothing to do
            return
        # Check if an order has been completed
        # Attention: broker could reject order if not enough cash
        if order.status in [order.Completed]:
            if order.isbuy():
                self.log('LONG EXECUTED, %.2f' % order.executed.price)
            elif order.issell():
                self.log('SELL EXECUTED, %.2f' % order.executed.price)
            self.bar_executed = len(self)
        elif order.status in [order.Canceled, order.Margin, order.Rejected]:
            self.log('Order Canceled/Margin/Rejected')
        # Write down: no pending order
        self.order = None

    def next(self):
        self.log("Close: '{0}'".format(self.data.adj_close[0]))
        print('%f %f %f %f %f %f %f %f %f %f %f %f %f' % (self.data.Indexx[0], self.data.open[0],
              self.data.high[0], self.data.low[0],
              self.data.close[0], self.data.adj_close[0],
              self.data.volume[0], self.data.EMA_100[0],
              self.data.RSI[0], self.data.CCI[0],
              self.data.MACD_macd[0], self.data.MACD_sign[0], self.data.MACD_hist[0]))
        if self.order:
            return
        if self.data.MACD_hist[0] > 0:
            if self.position.size < 0 and self.data.MACD_hist[-1] < 0:
                self.close()
                self.log('CLOSE SHORT POSITION, %.2f' % self.dataclose[0])
            elif self.position.size == 0:
                self.order = self.buy()
                self.log('OPEN LONG POSITION, %.2f' % self.dataclose[0])
        elif self.data.MACD_hist[0] < 0:
            if self.position.size > 0 and self.data.MACD_hist[-1] > 0:
                self.order = self.close()
                self.log('CLOSE LONG POSITION, %.2f' % self.dataclose[0])
            elif self.position.size == 0:
                self.order = self.sell()
                self.log('OPEN SHORT POSITION, %.2f' % self.dataclose[0])
        print('')


class BasicIndicatorsFeeded(btFeeds.PandasData):
    lines = ('Indexx', 'adj_close', 'EMA_100', 'RSI', 'CCI', 'MACD_macd', 'MACD_sign', 'MACD_hist',)
    params = (('Indexx', 0), ('adj_close', 5), ('volume', 6),
              ('EMA_100', 7), ('RSI', 8), ('CCI', 9),
              ('MACD_macd', 10), ('MACD_sign', 11), ('MACD_hist', 12),)


if __name__ == '__main__':
    cerebro = bt.Cerebro()

    # Add data feed to Cerebro
    data1 = yf.download("AAPL", start="2021-08-09", end="2021-12-21", group_by="ticker")
    data1.insert(0, 'Indexx', ' ')
    data1['Indexx'] = range(len(data1))
    data1['EMA_100'] = talib.EMA(data1['Adj Close'], 100)
    data1['RSI'] = talib.RSI(data1['Adj Close'], 14)
    data1['CCI'] = talib.CCI(data1['High'], data1['Low'], data1['Adj Close'], timeperiod=14)
    data1['MACD_macd'] = talib.MACD(data1['Adj Close'], fastperiod=12, slowperiod=26, signalperiod=9)[0]
    data1['MACD_sign'] = talib.MACD(data1['Adj Close'], fastperiod=12, slowperiod=26, signalperiod=9)[1]
    data1['MACD_hist'] = talib.MACD(data1['Adj Close'], fastperiod=12, slowperiod=26, signalperiod=9)[2]
    # data1['Long_position']

    # Run Cerebro Engine
    cerebro.broker.setcash(8000000000)
    start_portfolio_value = cerebro.broker.getvalue()
    cerebro.addstrategy(SimpleMACDStrat)
    data = BasicIndicatorsFeeded(dataname=data1)
    cerebro.adddata(data)
    cerebro.run()
    cerebro.plot()
    # print(data1)
    print('-------------------')
    # print('%f' % data)
    # print(data)
    end_portfolio_value = cerebro.broker.getvalue()
    pnl = end_portfolio_value - start_portfolio_value
    print(f'Starting Portfolio Value: {start_portfolio_value:2f}')
    print(f'Final Portfolio Value: {end_portfolio_value:2f}')
    print(f'PnL: {pnl:.2f}')
Here are the results:
[results screenshot]
On 2021-11-10, macd_hist goes from positive to negative. We expect that on the next day (2021-11-11):
a) the long position is closed, and right after,
b) a short position is opened.
1) We see that the long position in a) is actually closed on the same day. Isn't that supposed to happen the next day?
2) Also, a sell is executed the day after, which is not supposed to happen.
Any suggestions for 1) and 2) would be more than welcome. Thanks.
Abbe
EDIT:
By the way, I'm aware the idea can be coded this way (with only next()):
def next(self):
    # print('%f' % (self.datas[0].Indexxx[0]))
    self.log("Close: '{0}'".format(self.data.adj_close[0]))
    print('%f %f %f %f %f %f %f %f %f %f %f %f %f' % (self.data.Indexx[0], self.data.open[0],
          self.data.high[0], self.data.low[0],
          self.data.close[0], self.data.adj_close[0],
          self.data.volume[0], self.data.EMA_100[0],
          self.data.RSI[0], self.data.CCI[0],
          self.data.MACD_macd[0], self.data.MACD_sign[0], self.data.MACD_hist[0]))
    if self.order:
        return
    print(self.position)
    if self.data.MACD_hist[0] > 0 and self.data.MACD_hist[-1] < 0:
        self.order = self.buy()
        self.log('CLOSE SHORT POSITION and open long, %.2f' % self.dataclose[0])
    if self.data.MACD_hist[0] < 0 and self.data.MACD_hist[-1] > 0:
        self.order = self.sell()
        self.log('CLOSE LONG POSITION and open short, %.2f' % self.dataclose[0])
    print('')
But I really want to separate the
self.close()
from, for instance, the
self.buy()
That would allow me later to use different conditions for closing a position and for opening one.
Thanks a lot for any input, ideas, or remarks.
Abbe
In your code you are showing the following:
if self.data.MACD_hist[0] > 0:
    if self.position.size < 0 and self.data.MACD_hist[-1] < 0:
        self.close()
        self.log('CLOSE SHORT POSITION, %.2f' % self.dataclose[0])
    elif self.position.size == 0:
        self.order = self.buy()
        self.log('OPEN LONG POSITION, %.2f' % self.dataclose[0])
elif self.data.MACD_hist[0] < 0:
    if self.position.size > 0 and self.data.MACD_hist[-1] > 0:
        self.order = self.close()
        self.log('CLOSE LONG POSITION, %.2f' % self.dataclose[0])
    elif self.position.size == 0:
        self.order = self.sell()
        self.log('OPEN SHORT POSITION, %.2f' % self.dataclose[0])
Based on this logic, only one of these conditions can be met in a given next().
You indicate that you would like to have the close and the entry separate. To do that, change your elif to if. Also, if you use a criterion of self.position.size == 0, it will not be true until the close order has executed, i.e. on the bar after the close, not the same one. But if you wish to use other criteria, you could put them under another if statement.
if self.data.MACD_hist[0] > 0:
    if self.position.size < 0 and self.data.MACD_hist[-1] < 0:
        self.close()
        self.log('CLOSE SHORT POSITION, %.2f' % self.dataclose[0])

    #### change here ####
    if SOME OTHER CONDITION:
        self.order = self.buy()
        self.log('OPEN LONG POSITION, %.2f' % self.dataclose[0])
...
Once you close the position, it is not really necessary to check if the position goes to 0 units. You can safely assume it will.
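If you also want the re-entry to wait until the close has actually filled, one possible approach (an untested sketch, not from the code above, and assuming the same custom feed with a MACD_hist line as in the question) is to remember that a close was sent and only send the opposite entry once the position is flat again:

import backtrader as bt

class SplitCloseOpen(bt.Strategy):
    # Sketch only: close on a MACD histogram flip, then open the opposite
    # position on a later bar, after the close order has completed.
    def __init__(self):
        self.order = None
        self.pending_entry = None   # 'long' or 'short' once a close was sent

    def next(self):
        if self.order:
            return
        hist, prev_hist = self.data.MACD_hist[0], self.data.MACD_hist[-1]
        if hist > 0 and prev_hist < 0 and self.position.size < 0:
            self.order = self.close()
            self.pending_entry = 'long'
        elif hist < 0 and prev_hist > 0 and self.position.size > 0:
            self.order = self.close()
            self.pending_entry = 'short'
        elif self.position.size == 0 and self.pending_entry:
            # Runs on a later bar, once the close has filled and we are flat.
            self.order = self.buy() if self.pending_entry == 'long' else self.sell()
            self.pending_entry = None

    def notify_order(self, order):
        if order.status in [order.Completed, order.Canceled, order.Margin, order.Rejected]:
            self.order = None

The pending_entry flag is just a placeholder for whatever separate entry condition you end up using.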
I just started using ib_insync, and I am trying to get the tick data into a DataFrame.
Here is the relevant code:
def onPendingTickers(tickers, conn=conn):
    for t in tickers:
        # 'CREATE TABLE IF NOT EXISTS {} (timestamp timestamp, bid_qty INT, bid REAL, ask REAL, ' \
        # 'ask_qty INT, high REAL, low REAL, close REAL, open REAL, contractID INT)'
        # print(t)
        c.execute('INSERT INTO {} (timestamp, bid_qty, bid, ask, ask_qty, high, low, close, open, contractID)'
                  ' VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);'.format(t.contract.pair()),
                  (t.time, t.bidSize, t.bid, t.ask, t.askSize, t.high, t.low, t.close, t.open, t.contract.conId))
        # print(t.time, t.bidSize, t.bid, t.ask, t.askSize, t.high, t.low, t.close, t.open, t.contract.conId)
    conn.commit()


ib.pendingTickersEvent += onPendingTickers
ib.sleep(60 * 60)
ib.pendingTickersEvent -= onPendingTickers
When I run this code in a terminal, it prints the ticker, but I am not sure what exactly needs to be changed here.
If you just want to get ticks without displaying the information, here's some sample code that you should be able to run:
from ib_insync import *
import pandas as pd
import numpy as np


# Connect to IB; args are (IP address, device number, client ID)
def ibConnect(port, clientID):
    connection = ib.connect('127.0.0.1', port, clientID)
    ib.sleep(0)
    return ()


# Disconnect from IB
def ibDisconnect():
    ib.disconnect()
    ib.sleep(0)
    return


# Set up a futures contract
def ibFuturesContract(symbol, expirationDate, exchange):
    futuresContract = Future(symbol, expirationDate, exchange)
    return futuresContract


# Realtime Ticks Subscription
def ibGetTicker(contract):
    ticker = ib.ticker(contract)
    return [ticker]


ib = IB()
ibConnect(7496, 300)
contract = ibFuturesContract('YM', 20210618, 'ECBOT')

# Start the real-time tick subscription
ib.reqMktData(contract, '', False, False)

# Real Time Ticks
global ticker
ticker = ibGetTicker(contract)

# Get just the last tick each second and put it into a data table
x = 0
while x < 10:
    ib.sleep(1)
    if ticker is not None:
        df = util.df(ticker)
        if x == 0:
            dt = df
        else:
            dt = dt.append(df)
        x = x + 1

print(dt)

ib.cancelMktData(contract)
ibDisconnect()
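Coming back to the original goal of getting the streamed ticks into a DataFrame: here is a minimal, untested sketch closer to the question's pendingTickersEvent handler. The Forex contract, port, and clientId are placeholder assumptions, and building the frame from a list of dicts sidesteps the now-deprecated DataFrame.append used above:

from ib_insync import IB, Forex
import pandas as pd

ib = IB()
ib.connect('127.0.0.1', 7496, clientId=301)   # placeholder port/clientId

contract = Forex('EURUSD')                    # placeholder contract
ib.reqMktData(contract, '', False, False)

rows = []

def onPendingTickers(tickers):
    # One row per ticker update; util.df(tickers) would also work here.
    for t in tickers:
        rows.append({
            'time': t.time, 'bid': t.bid, 'bidSize': t.bidSize,
            'ask': t.ask, 'askSize': t.askSize, 'last': t.last,
        })

ib.pendingTickersEvent += onPendingTickers
ib.sleep(10)                                  # collect ticks for 10 seconds
ib.pendingTickersEvent -= onPendingTickers

df = pd.DataFrame(rows)
print(df.tail())

ib.cancelMktData(contract)
ib.disconnect()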
I am running Oracle 19c and I want to get the best insert performance I can. Currently I insert using INSERT /*+APPEND */ ..., which is fine, but not the speed I was hoping for.
I read that using FORALL is a lot faster, but I couldn't really find any examples.
Here is the code snippet (Python 3):
connection = pool.acquire()
cursor = connection.cursor()
cursor.executemany("INSERT /*+APPEND*/ INTO RANDOM VALUES (:1, :2, :3)", list(random))
connection.commit()
cursor.close()
connection.close()
I got really interested in what would be faster, so I've tested some possible ways to compare them:
simple executemany with no tricks.
the same with APPEND_VALUES hint inside the statement.
the union all approach you've tried in another question. This should be slower than the above since it generates a really large statement (which can potentially require more network transfer than the data itself). It then has to be parsed at the DB side, which also consumes a lot of time and negates all the benefits (not to mention the potential size limit). I then ran it with executemany in chunks so as not to build a single statement for 100k records. I didn't concatenate values inside the statement, because I wanted to keep it safe.
insert all. The same downsides, but no unions. Compare it with the union version.
serialize the data to JSON and deserialize it at the DB side with json_table. Potentially good performance, with a single short statement and a single data transfer, plus the small overhead of JSON.
your suggested FORALL in a PL/SQL wrapper procedure. It should be about the same as executemany since it does the same thing, but at the database side, with the overhead of transforming the data into the collection.
the same FORALL, but with a columnar approach to passing the data: simple lists of column values instead of a complex type. It should be much faster than FORALL with a collection since there's no need to serialize the data into the collection's type.
I used Oracle Autonomous Database in Oracle Cloud with a free account. Each method was executed 10 times in a loop with the same input dataset of 100k records, and the table was recreated before each test. These are the results I got. Preparation and execution times are, respectively, the data transformation at the client side and the DB call itself.
>>> t = PerfTest(100000)
>>> t.run("exec_many", 10)
Method: exec_many.
Duration, avg: 2.3083874 s
Preparation time, avg: 0.0 s
Execution time, avg: 2.3083874 s
>>> t.run("exec_many_append", 10)
Method: exec_many_append.
Duration, avg: 2.6031369 s
Preparation time, avg: 0.0 s
Execution time, avg: 2.6031369 s
>>> t.run("union_all", 10, 10000)
Method: union_all.
Duration, avg: 27.9444233 s
Preparation time, avg: 0.0408773 s
Execution time, avg: 27.8457551 s
>>> t.run("insert_all", 10, 10000)
Method: insert_all.
Duration, avg: 70.6442494 s
Preparation time, avg: 0.0289269 s
Execution time, avg: 70.5541995 s
>>> t.run("json_table", 10)
Method: json_table.
Duration, avg: 10.4648237 s
Preparation time, avg: 9.7907693 s
Execution time, avg: 0.621006 s
>>> t.run("forall", 10)
Method: forall.
Duration, avg: 5.5622837 s
Preparation time, avg: 1.8972456000000002 s
Execution time, avg: 3.6650380999999994 s
>>> t.run("forall_columnar", 10)
Method: forall_columnar.
Duration, avg: 2.6702698000000002 s
Preparation time, avg: 0.055710800000000005 s
Execution time, avg: 2.6105702 s
>>>
The fastest way is plain executemany, which is not much of a surprise. What is interesting is that APPEND_VALUES does not improve the query and takes more time on average, so this needs more investigation.
About FORALL: as expected, an individual array for each column takes less time since there is no data preparation for it. It is more or less comparable with executemany, but I think the PL/SQL overhead plays some role here.
Another interesting part for me is JSON: most of the time was spent on serialization and on writing the LOB into the database, but the query itself was very fast. Maybe the write operation can be improved with a chunk size or some other way of passing LOB data into the select statement, but as my code stands it is far from the simple and straightforward executemany approach.
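One thing I did not test: letting cx_Oracle bind the JSON string as a CLOB directly via setinputsizes, instead of creating and writing a temporary LOB first, which might cut down that preparation time. A rough, untested sketch, reusing the same names as in the test class below (treat it as an assumption, not a measured result):

# Untested sketch: bind the JSON payload straight into the statement as a CLOB.
payload = json.dumps([{"id": t[0], "str": t[1], "val": t[2]} for t in self.inp])
self._cur.setinputsizes(json=db.DB_TYPE_CLOB)
self._cur.execute("""
    insert into rand(id, str, val)
    select id, str, val
    from json_table(:json, '$[*]'
        columns
            id int,
            str varchar2(100),
            val number
    )
""", json=payload)
self._con.commit()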
There are also possible approaches without Python that should be faster, being native tools for external data, but I didn't test them:
Oracle SQL*Loader
External table
Below is the code I've used for testing.
import cx_Oracle as db
import os, random, json
import datetime as dt


class PerfTest:

    def __init__(self, size):
        self._con = db.connect(
            os.environ["ora_cloud_usr"],
            os.environ["ora_cloud_pwd"],
            "test_low",
            encoding="UTF-8"
        )
        self._cur = self._con.cursor()
        self.inp = [(i, "Test {i}".format(i=i), random.random()) for i in range(size)]

    def __del__(self):
        if self._con:
            self._con.rollback()
            self._con.close()

    #Create objets
    def setup(self):
        try:
            self._cur.execute("drop table rand")
            #print("table dropped")
        except:
            pass
        self._cur.execute("""create table rand(
            id int,
            str varchar2(100),
            val number
        )""")
        self._cur.execute("""create or replace package pkg_test as
            type ts_test is record (
                id rand.id%type,
                str rand.str%type,
                val rand.val%type
            );
            type tt_test is table of ts_test index by pls_integer;
            type tt_ids is table of rand.id%type index by pls_integer;
            type tt_strs is table of rand.str%type index by pls_integer;
            type tt_vals is table of rand.val%type index by pls_integer;
            procedure write_data(p_data in tt_test);
            procedure write_data_columnar(
                p_ids in tt_ids,
                p_strs in tt_strs,
                p_vals in tt_vals
            );
        end;""")
        self._cur.execute("""create or replace package body pkg_test as
            procedure write_data(p_data in tt_test)
            as
            begin
                forall i in indices of p_data
                    insert into rand(id, str, val)
                    values (p_data(i).id, p_data(i).str, p_data(i).val)
                ;
                commit;
            end;

            procedure write_data_columnar(
                p_ids in tt_ids,
                p_strs in tt_strs,
                p_vals in tt_vals
            ) as
            begin
                forall i in indices of p_ids
                    insert into rand(id, str, val)
                    values (p_ids(i), p_strs(i), p_vals(i))
                ;
                commit;
            end;
        end;
        """)

    def build_union(self, size):
        return """insert into rand(id, str, val)
            select id, str, val from rand where 1 = 0 union all
            """ + """ union all """.join(
                ["select :{}, :{}, :{} from dual".format(i*3+1, i*3+2, i*3+3)
                 for i in range(size)]
            )

    def build_insert_all(self, size):
        return """
            """.join(
                ["into rand(id, str, val) values (:{}, :{}, :{})".format(i*3+1, i*3+2, i*3+3)
                 for i in range(size)]
            )

    #Test case with executemany
    def exec_many(self):
        start = dt.datetime.now()
        self._cur.executemany("insert into rand(id, str, val) values (:1, :2, :3)", self.inp)
        self._con.commit()
        return (dt.timedelta(0), dt.datetime.now() - start)

    #The same as above but with prepared statement (no parsing)
    def exec_many_append(self):
        start = dt.datetime.now()
        self._cur.executemany("insert /*+APPEND_VALUES*/ into rand(id, str, val) values (:1, :2, :3)", self.inp)
        self._con.commit()
        return (dt.timedelta(0), dt.datetime.now() - start)

    #Union All approach (chunked). Should have large parse time
    def union_all(self, size):
        ##Chunked list of big tuples
        start_prepare = dt.datetime.now()
        new_inp = [
            tuple([item for t in r for item in t])
            for r in list(zip(*[iter(self.inp)]*size))
        ]
        new_stmt = self.build_union(size)
        dur_prepare = dt.datetime.now() - start_prepare

        #Execute unions
        start_exec = dt.datetime.now()
        self._cur.executemany(new_stmt, new_inp)
        dur_exec = dt.datetime.now() - start_exec

        ##In case the size is not a divisor
        remainder = len(self.inp) % size
        if remainder > 0:
            start_prepare = dt.datetime.now()
            new_stmt = self.build_union(remainder)
            new_inp = tuple([
                item for t in self.inp[-remainder:] for item in t
            ])
            dur_prepare += dt.datetime.now() - start_prepare

            start_exec = dt.datetime.now()
            self._cur.execute(new_stmt, new_inp)
            dur_exec += dt.datetime.now() - start_exec

        self._con.commit()
        return (dur_prepare, dur_exec)

    #The same as union all, but with no need to union something
    def insert_all(self, size):
        ##Chunked list of big tuples
        start_prepare = dt.datetime.now()
        new_inp = [
            tuple([item for t in r for item in t])
            for r in list(zip(*[iter(self.inp)]*size))
        ]
        new_stmt = """insert all
            {}
            select * from dual"""
        dur_prepare = dt.datetime.now() - start_prepare

        #Execute
        start_exec = dt.datetime.now()
        self._cur.executemany(
            new_stmt.format(self.build_insert_all(size)),
            new_inp
        )
        dur_exec = dt.datetime.now() - start_exec

        ##In case the size is not a divisor
        remainder = len(self.inp) % size
        if remainder > 0:
            start_prepare = dt.datetime.now()
            new_inp = tuple([
                item for t in self.inp[-remainder:] for item in t
            ])
            dur_prepare += dt.datetime.now() - start_prepare

            start_exec = dt.datetime.now()
            self._cur.execute(
                new_stmt.format(self.build_insert_all(remainder)),
                new_inp
            )
            dur_exec += dt.datetime.now() - start_exec

        self._con.commit()
        return (dur_prepare, dur_exec)

    #Serialize at server side and do deserialization at DB side
    def json_table(self):
        start_prepare = dt.datetime.now()
        new_inp = json.dumps([
            {"id": t[0], "str": t[1], "val": t[2]} for t in self.inp
        ])
        lob_var = self._con.createlob(db.DB_TYPE_CLOB)
        lob_var.write(new_inp)
        start_exec = dt.datetime.now()
        self._cur.execute("""
            insert into rand(id, str, val)
            select id, str, val
            from json_table(
                to_clob(:json), '$[*]'
                columns
                    id int,
                    str varchar2(100),
                    val number
            )
        """, json=lob_var)
        dur_exec = dt.datetime.now() - start_exec
        self._con.commit()
        return (start_exec - start_prepare, dur_exec)

    #PL/SQL with FORALL
    def forall(self):
        start_prepare = dt.datetime.now()
        collection_type = self._con.gettype("PKG_TEST.TT_TEST")
        record_type = self._con.gettype("PKG_TEST.TS_TEST")

        def recBuilder(x):
            rec = record_type.newobject()
            rec.ID = x[0]
            rec.STR = x[1]
            rec.VAL = x[2]
            return rec

        inp_collection = collection_type.newobject([
            recBuilder(i) for i in self.inp
        ])
        start_exec = dt.datetime.now()
        self._cur.callproc("pkg_test.write_data", [inp_collection])
        dur_exec = dt.datetime.now() - start_exec
        return (start_exec - start_prepare, dur_exec)

    #PL/SQL with FORALL and plain collections
    def forall_columnar(self):
        start_prepare = dt.datetime.now()
        ids, strs, vals = map(list, zip(*self.inp))
        start_exec = dt.datetime.now()
        self._cur.callproc("pkg_test.write_data_columnar", [ids, strs, vals])
        dur_exec = dt.datetime.now() - start_exec
        return (start_exec - start_prepare, dur_exec)

    #Run test
    def run(self, method, iterations, *args):
        #Cleanup schema
        self.setup()

        start = dt.datetime.now()
        runtime = []
        for i in range(iterations):
            single_run = getattr(self, method)(*args)
            runtime.append(single_run)
        dur = dt.datetime.now() - start
        dur_prep_total = sum([i.total_seconds() for i, _ in runtime])
        dur_exec_total = sum([i.total_seconds() for _, i in runtime])

        print("""Method: {meth}.
        Duration, avg: {run_dur} s
        Preparation time, avg: {prep} s
        Execution time, avg: {ex} s""".format(
            inp_s=len(self.inp),
            meth=method,
            run_dur=dur.total_seconds() / iterations,
            prep=dur_prep_total / iterations,
            ex=dur_exec_total / iterations
        ))
I've got a NetCDF file of weather data (one of thousands that require PostgreSQL ingestion). I'm currently able to insert each band into a PostGIS-enabled table at a rate of about 20-23 seconds per band. (This is for monthly data; there is also daily data that I have yet to test.)
I've heard of different ways of speeding this up (COPY FROM, removing the gid, using SSDs, etc.), but I'm new to Python and have no idea how to get the NetCDF data into something I could use with COPY FROM, or what the best route might be.
If anyone has any other ideas on how to speed this up, please share!
Here is the ingestion script
import netCDF4, psycopg2, time

# Establish connection
db1 = psycopg2.connect("host=localhost dbname=postgis_test user=********** password=********")
cur = db1.cursor()

# Create Table in postgis
print(str(time.ctime()) + " CREATING TABLE")
try:
    cur.execute("DROP TABLE IF EXISTS table_name;")
    db1.commit()
    cur.execute(
        "CREATE TABLE table_name (gid serial PRIMARY KEY not null, thedate DATE, thepoint geometry, lon decimal, lat decimal, thevalue decimal);")
    db1.commit()
    print("TABLE CREATED")
except:
    print(psycopg2.DatabaseError)
    print("TABLE CREATION FAILED")

rawvalue_nc_file = 'netcdf_file.nc'
nc = netCDF4.Dataset(rawvalue_nc_file, mode='r')
nc.variables.keys()

lat = nc.variables['lat'][:]
lon = nc.variables['lon'][:]
time_var = nc.variables['time']
dtime = netCDF4.num2date(time_var[:], time_var.units)
newtime = [fdate.strftime('%Y-%m-%d') for fdate in dtime]
rawvalue = nc.variables['tx_max'][:]

lathash = {}
lonhash = {}
entry1 = 0
entry2 = 0

lattemp = nc.variables['lat'][:].tolist()
for entry1 in range(lat.size):
    lathash[entry1] = lattemp[entry1]

lontemp = nc.variables['lon'][:].tolist()
for entry2 in range(lon.size):
    lonhash[entry2] = lontemp[entry2]

for timestep in range(dtime.size):
    print(str(time.ctime()) + " " + str(timestep + 1) + "/180")
    for _lon in range(lon.size):
        for _lat in range(lat.size):
            latitude = round(lathash[_lat], 6)
            longitude = round(lonhash[_lon], 6)
            thedate = newtime[timestep]
            thevalue = round(float(rawvalue.data[timestep, _lat, _lon] - 273.15), 3)
            if (thevalue > -100):
                cur.execute("INSERT INTO table_name (thedate, thepoint, thevalue) VALUES (%s, ST_MakePoint(%s,%s,0), %s)",
                            (thedate, longitude, latitude, thevalue))
                db1.commit()

cur.close()
db1.close()

print(" Done!")
If you're certain most of the time is spent in PostgreSQL, and not in any other code of your own, you may want to look at the fast execution helpers, namely psycopg2.extras.execute_values() in your case.
Also, you may want to make sure you're in a transaction, so the database doesn't fall back to an autocommit mode. ("If you do not issue a BEGIN command, then each individual statement has an implicit BEGIN and (if successful) COMMIT wrapped around it.")
Something like this could do the trick -- not tested though.
import psycopg2.extras  # execute_values lives in psycopg2.extras

for timestep in range(dtime.size):
    print(str(time.ctime()) + " " + str(timestep + 1) + "/180")
    values = []
    cur.execute("BEGIN")
    for _lon in range(lon.size):
        for _lat in range(lat.size):
            latitude = round(lathash[_lat], 6)
            longitude = round(lonhash[_lon], 6)
            thedate = newtime[timestep]
            thevalue = round(
                float(rawvalue.data[timestep, _lat, _lon] - 273.15), 3
            )
            if thevalue > -100:
                values.append((thedate, longitude, latitude, thevalue))
    psycopg2.extras.execute_values(
        cur,
        "INSERT INTO table_name (thedate, thepoint, thevalue) VALUES %s",
        values,
        template="(%s, ST_MakePoint(%s,%s,0), %s)"
    )
    db1.commit()
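Since the question also mentions COPY FROM: as a further, untested sketch, the same per-timestep values list could be streamed through COPY via an in-memory buffer (usually faster still than multi-row INSERTs), with the geometry column filled in afterwards by a single UPDATE. Column names follow the CREATE TABLE in the question:

import io

# Hedged sketch: COPY the plain columns for one timestep, then build the
# geometry server-side from lon/lat.
buf = io.StringIO()
for thedate, longitude, latitude, thevalue in values:
    buf.write("%s\t%s\t%s\t%s\n" % (thedate, longitude, latitude, thevalue))
buf.seek(0)
cur.copy_from(buf, 'table_name', columns=('thedate', 'lon', 'lat', 'thevalue'))
cur.execute(
    "UPDATE table_name SET thepoint = ST_MakePoint(lon, lat, 0) "
    "WHERE thepoint IS NULL"
)
db1.commit()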
I'm having trouble inserting data into my table. I have a list of stocks that I pass to the function getStockData.
I use a for loop to iterate through the list and get the data for each ticker symbol. At the end I put all the information into a dictionary. My final step is to insert the data into a table. I've been unsuccessful at inserting the data in the dictionary into my table.
def getStockData(x):
    nowdate = raw_input("What Is Todays Date?: ")
    print "Todays list has %d stocks on it\n" % len(x)
    for stock in x:
        stockPrice = ystockquote.get_price(stock)
        stockPriceChange = ystockquote.get_change(stock)
        originalPrice = float(stockPrice) + (float(stockPriceChange) * -1)
        changePercentage = (float(stockPriceChange) / originalPrice) * 100
        stockDict = {'Date': nowdate, 'Ticker Symbol': stock, 'Closing Price': stockPrice,
                     'Price Change': stockPriceChange, 'Percentage Changed': changePercentage}
        conn = db.connect('stocks.db')
        cursor = conn.cursor()
        cursor.execute('insert into losers values (?, ?, ?, ?, ?)',
                       (stockDict['Date'], stockDict['Ticker Symbol'], stockDict['Price Change'],
                        stockDict['Percentage Changed'], stockDict['Closing Price']))
        conn.close()
I think you forgot to commit your data to the DB before closing the connection.
Try
conn.commit()
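For illustration, a minimal sketch of the fixed flow (with made-up rows, and assuming the losers table already exists with these five columns in this order):

import sqlite3

conn = sqlite3.connect('stocks.db')
cursor = conn.cursor()
rows = [
    # (date, ticker symbol, price change, percentage changed, closing price)
    ('2016-07-14', 'ABC', -1.25, -3.1, 39.10),
    ('2016-07-14', 'XYZ', -0.40, -0.9, 44.05),
]
cursor.executemany('insert into losers values (?, ?, ?, ?, ?)', rows)
conn.commit()   # without this, the rows are discarded when the connection closes
conn.close()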