Persist datetime values for use with gviz api - python

I have a Python application that collects data from an MQTT broker and presents it to a website via the gviz Python API:
DESCRIPTION = [
    ('Zeit', 'datetime'),
    ('Temperatur', 'number'),
    ('Feuchtigkeit', 'number'),
    ('Batterie', 'number')
]
def sendAnswer(conn):
    # protect against the data supplier
    Mutex.acquire()
    Trans = deepcopy(DatArr)
    Mutex.release()
    # create and populate the DataTable
    data_table = gviz_api.DataTable(DESCRIPTION)
    data_table.LoadData(Trans)
    Answer = data_table.ToJSon()
    # send it to the webserver
    lng = len(Answer)
    try:
        conn.sendall(bytes("L{:06d};".format(lng), "UTF-8"))
        conn.sendall(bytes(Answer, "UTF-8"))
    except BaseException:
        # if anything goes wrong, try again next time
        pass
def on_message(client, userdata, message):
    global Last, DatArr
    # get the data from the broker
    cur = json.loads(str(message.payload, encoding='utf-8'))
    if cur == Last and len(DatArr) > 2:
        return
    now = datetime.now()
    # protect against the webserver
    Mutex.acquire()
    # add the data
    DatArr.append([now, cur["temp"], cur["hum"], cur["bat"]])
    # cleanup old values
    Last = cur
    for i in range(len(DatArr)):
        if now - DatArr[0][0] > timedelta(days=1):
            DatArr.pop(0)
        else:
            break
    Mutex.release()
This works, but instead of keeping the data in the Python variable I want to persist it in a file (preferably JSON). However, I cannot json.dump() a datetime value, and I cannot .LoadData() a string into a gviz DataTable. The Python gviz API also lacks an addRow(). Any suggestions?
Much thanks in advance!

Based on the answers to this question: JSON datetime between Python and JavaScript, I found a solution and implemented it in a Python module:
import json
import datetime

class DateTimeJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            return dict(nested_datetime=obj.isoformat())
        else:
            return super(DateTimeJSONEncoder, self).default(obj)

def datetime_decoder(d):
    if len(d) == 1 and 'nested_datetime' in d:
        return datetime.datetime.strptime(d['nested_datetime'], '%Y-%m-%dT%H:%M:%S.%f')
    result = {}
    for prop in d:
        if isinstance(d[prop], dict):
            result[prop] = datetime_decoder(d[prop])
        else:
            result[prop] = d[prop]
    return result
The class and the function are passed as named parameters to the json.dump and json.load functions, like this:
DatArr = json.load(DatFile, object_hook=djson.datetime_decoder)
and
json.dump(DatArr, DatFile, cls=djson.DateTimeJSONEncoder)
This persists the formerly global variable DatArr in the json file DatFile.
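For completeness, a minimal sketch of the resulting load/save cycle (assuming the module above is saved as djson.py; the file name data.json and the surrounding file handling are assumptions, not part of the original code):
import json
import djson  # the module shown above (name assumed)

# Load the persisted data on startup; the object_hook restores datetime objects.
with open('data.json', 'r') as DatFile:
    DatArr = json.load(DatFile, object_hook=djson.datetime_decoder)

# ... append new rows in on_message() as before ...

# Save the data back; the custom encoder serializes the datetime objects.
with open('data.json', 'w') as DatFile:
    json.dump(DatArr, DatFile, cls=djson.DateTimeJSONEncoder)

# The reloaded list can be fed to the gviz DataTable exactly as before:
# data_table.LoadData(DatArr)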
Thanks to all the posters to the above question for providing the information.

Related

How to get FX Forward Rates instead of Points in BLPAPI in Python

I am trying to get "CHF1M Curncy" as Forward Rates and not as points from the Bloomberg API BLPAPI in Python. The code itself works fine for the Forward Points, but as soon as I use the override to switch from Points to Rates, I get the error "No value for []" for the line with the first override. This is my code:
import blpapi
import json

HISTORICAL_DATA_RESPONSE = blpapi.Name("HistoricalDataResponse")

def historical_bloomberg_data(securities, fields, periodicity="DAILY", start_date="20190101", end_date="20190105"):
    # Create and start a session
    print("Creating session ...")
    sessionOptions = blpapi.SessionOptions()
    sessionOptions.setServerHost('localhost')
    sessionOptions.setServerPort(8194)
    session = blpapi.Session(sessionOptions)
    if not session.start():
        print("Failed to start session")
        return
    try:
        # Create and open a service
        print("Creating historical data service")
        if not session.openService("//blp/refdata"):
            print("Failed to create service")
            return
        refDataService = session.getService("//blp/refdata")
        request = refDataService.createRequest("HistoricalDataRequest")
        # Add all securities
        for security in securities:
            request.getElement("securities").appendValue(security)
        # Add all fields
        for field in fields:
            request.getElement("fields").appendValue(field)
        # Further settings
        request.set("periodicitySelection", periodicity)
        request.set("startDate", start_date)
        request.set("endDate", end_date)
        request.set("maxDataPoints", 2000)
        # Override code
        overrides = request.getElement('overrides')
        override1 = overrides.appendElement()
        override1.setElement('fieldID', 'FWD_CURVE_FORMAT')
        override1.setElement('value', 'RATES')
        #request.setOverride("FWD_CURVE_FORMAT", "RATES")
        # Send and process request
        print("Sending request...")
        session.sendRequest(request)
        results = {}
        while True:
            ev = session.nextEvent(500)
            for msg in ev:
                if msg.messageType() == HISTORICAL_DATA_RESPONSE:
                    #response = msg.getElement()
                    response = msg.getElement("securityData")
                    _sec = response.getElementAsString("security")
                    _data = [[fd.getElementAsString("date")] + [fd.getElementAsString(_) for _ in fields] for fd in response.getElement("fieldData").values()]
                    results[_sec] = results.get(_sec, []) + _data
            if ev.eventType() == blpapi.Event.RESPONSE:
                break
        print("Results retrieved")
        return results
    finally:
        # Stop the session (cleanup for the try block above)
        session.stop()
The correct override is FWD_CURVE_QUOTE_FORMAT. The values can be:
POINTS
RATES
OUTRIGHT
(I suspect OUTRIGHT and RATES do the same thing)
EDIT
And the correct way to override a field is by using fieldId (ending with a lowercase d) and not fieldID.
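Putting both corrections together, the override block from the code above would become (a sketch based on this answer, not a full rework of the function):
overrides = request.getElement('overrides')
override1 = overrides.appendElement()
# Use the FWD_CURVE_QUOTE_FORMAT override, set via 'fieldId' (lowercase d)
override1.setElement('fieldId', 'FWD_CURVE_QUOTE_FORMAT')
override1.setElement('value', 'RATES')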

Flask loop takes long time to complete

I have this loop in my app.py. For some reason it extends the load time by over 3 seconds. Are there any solutions?
import dateutil.parser as dp

# Converts date from ISO-8601 string to formatted string and returns it
def dateConvert(date):
    return dp.parse(date).strftime("%H:%M # %e/%b/%y")

def nameFromID(userID):
    if userID is None:
        return 'Unknown'
    else:
        response = requests.get("https://example2.org/" + str(userID), headers=headers)
        return response.json()['firstName'] + ' ' + response.json()['lastName']

logs = []
response = requests.get("https://example.org", headers=headers)
for response in response.json():
    logs.append([nameFromID(response['member']), dateConvert(response['createdAt'])])
It extends the load time by over 3 seconds because it does a lot of unnecessary work, that's why.
You're not using requests Sessions. Each request will require creating and tearing down an HTTPS connection. That's slow.
You're doing another HTTPS request for each name conversion. (See above.)
You're parsing the JSON you get in that function twice.
Whatever dp.parse() is (dateutil?), it's probably doing a lot of extra work parsing from a free-form string. If you know the input format, use strptime.
Here's a rework that should be significantly faster. Please see the TODO points first, of course.
Also, if you know that the member id -> name mapping doesn't change, you can make name_cache a suitably named global variable too (but remember it may be persisted between requests); a sketch of that variant follows the rework below.
import datetime
import requests

INPUT_DATE_FORMAT = "TODO_FILL_ME_IN"  # TODO: FILL ME IN.

def dateConvert(date: str):
    return datetime.datetime.strptime(date, INPUT_DATE_FORMAT).strftime(
        "%H:%M # %e/%b/%y"
    )

def nameFromID(sess: requests.Session, userID):
    if userID is None:
        return "Unknown"
    response = sess.get(f"https://example2.org/{userID}")
    response.raise_for_status()
    data = response.json()
    return "{firstName} {lastName}".format_map(data)

def do_thing():
    headers = {}  # TODO: fill me in
    name_cache = {}
    with requests.Session() as sess:
        sess.headers.update(headers)
        logs = []
        response = sess.get("https://example.org")
        for response in response.json():
            member_id = response["member"]
            name = name_cache.get(member_id)
            if not name:
                name = name_cache[member_id] = nameFromID(sess, member_id)
            logs.append([name, dateConvert(response["createdAt"])])
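As mentioned above, if the member id -> name mapping is known not to change, the cache can live at module level so it survives between requests. A minimal sketch of that variant (the helper name cached_name is an assumption, not from the original answer):
# Module-level cache; only safe if the id -> name mapping never changes.
NAME_CACHE = {}

def cached_name(sess: requests.Session, member_id):
    name = NAME_CACHE.get(member_id)
    if name is None:
        name = NAME_CACHE[member_id] = nameFromID(sess, member_id)
    return name
Inside do_thing(), the cache lookup would then call cached_name(sess, member_id) instead of maintaining the local name_cache dict.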

Google cloud function using pubsub messages to start other GCF ends with 'status: connection error'

Situation:
I have 2 Google Cloud Functions, let's call them gcf-composer and gcf-action.
I have a list of 70,000 unique dicts for which I want to execute the gcf-action.
I use the gcf-composer to loop over all dicts and publish a message per dict to the gcf-action topic, containing the dict as payload.
I need the gcf-composer because running the gcf-action directly for all dicts would take more than the 9 min threshold.
I start off the gcf-composer using Google Cloud Scheduler.
Problem
When firing off the gcf-composer on the cloud, after a certain number of seconds it will stop and return the following:
'connection error'
These are the results of 4 separate tries.
Why does it give me "finished with status: 'connection error'", and how do I solve it?
When I run this locally, sending messages to the topic, it works.
Please let me know if you need any more code or information!
Code of gcf-composer
from mlibs.pubsub import PubSubConnection

pubsub = PubSubConnection()
TOPIC_NAME = 'gcf-action-topic'

def gcf_composer(period, list_x, names_y, run_method='local'):
    """Run composer given run method (local or cloud_fn)"""
    for k, row in names_y.iterrows():
        # get dict of identifiers
        y = row.to_dict()
        for x in list_x:
            parameters = {'x': x, 'y': y}
            if run_method == 'local':
                c.test_local(x=x, y=y)
            elif run_method == 'cloud-fn':
                pubsub.publish_to_topic(topic_name=TOPIC_NAME, params={'params': parameters})
            else:
                print(f'Unknown run method {run_method} used. Please try again.')
PubSubConnection:
"""Interaction with the Pub/Sub Engine"""
from google.oauth2 import service_account
from google.cloud import pubsub_v1
from mlibs.utils import json
from mlibs.utils import decode
from mlibs.utils import files as fs
class PubSubConnection:
def __init__(self):
"""Initiate a PubSub connection"""
self.project_id = None
self.publisher = None
self.count = 0
self.init_connection()
def init_connection(self):
"""Initiates a connection given the service account"""
self.publisher = pubsub_v1.PublisherClient(credentials=*****)
self.project_id = credentials.project_id
def publish_to_topic(self, topic_name, params):
# Define the topic path
topic_path = self.publisher.topic_path(self.project_id, topic_name)
# Convert to ByteString
params_bytes = json.dumps(params).encode('utf-8')
# Publish and handle the Future
cbl = Callable(self.count, params)
message_future = self.publisher.publish(topic_path, data=params_bytes, test='pubsub')
# Done callback
message_future.add_done_callback(cbl.callback)
# https://googleapis.dev/python/pubsub/latest/publisher/index.html#futures
# block result
# message_id = message_future.result()
self.count = self.count + 1
print(f'[pubsub][{self.count}][{topic_name}]')
class Callable:
def __init__(self, count, params):
self.count = count
self.params = params
def callback(self, message_future):
if message_future.exception(timeout=30):
print(f'[pubsub-except] Publishing message threw an Exception {message_future.exception()}')
else:
print(f'[pubsub][{self.count}][{message_future.result()}] {self.params}')

Data gets mixed up while trying to transfer it to arangodb

I'm trying to transfer ca. 10 GB of JSON data (tweets in my case) to a collection in ArangoDB. I'm also trying to use joblib for it:
from ArangoConn import ArangoConn
import Userdata as U
import encodings
from joblib import Parallel, delayed
import json
from glob import glob
import time

def progress(total, prog, start, stri=""):
    if prog == 0:
        print("")
        prog = 1
    perc = prog / total
    diff = time.time() - start
    rem = (diff / prog) * (total - prog)
    bar = ""
    for i in range(0, int(perc * 20)):
        bar = bar + "|"
    for i in range(int(perc * 20), 20):
        bar = bar + " "
    print("\r" + "progress: " + "[" + bar + "] " + str(prog) + " of " +
          str(total) + ": {0:.1f}% ".format(perc * 100) + "- " +
          time.strftime("%H:%M:%S", time.gmtime(rem)) + " " + stri, end="")

def processfile(filepath):
    file = open(filepath, encoding='utf-8')
    s = file.read()
    file.close()
    data = json.loads(s)
    Parallel(n_jobs=12, verbose=0, backend="threading")(
        map(delayed(ArangoConn.createDocFromObject), data))

files = glob(U.path + '/*.json')
i = 1
j = len(files)
starttime = time.time()
for f in files:
    progress(j, i, starttime, f)
    i = i + 1
    processfile(f)
and
from pyArango.connection import Connection
import Userdata as U
import time

class ArangoConn:
    def __init__(self, server, user, pw, db, collectionname):
        self.server = server
        self.user = user
        self.pw = pw
        self.db = db
        self.collectionname = collectionname
        self.connection = None
        self.dbHandle = self.connect()
        if not self.dbHandle.hasCollection(name=self.collectionname):
            coll = self.dbHandle.createCollection(name=collectionname)
        else:
            coll = self.dbHandle.collections[collectionname]
        self.collection = coll

    def db_createDocFromObject(self, obj):
        data = obj.__dict__()
        doc = self.collection.createDocument()
        for key, value in data.items():
            doc[key] = value
        doc._key = str(int(round(time.time() * 1000)))
        doc.save()

    def connect(self):
        self.connection = Connection(arangoURL=self.server + ":8529",
                                     username=self.user, password=self.pw)
        if not self.connection.hasDatabase(self.db):
            db = self.connection.createDatabase(name=self.db)
        else:
            db = self.connection.databases.get(self.db)
        return db

    def disconnect(self):
        self.connection.disconnectSession()

    def getAllData(self):
        docs = []
        for doc in self.collection.fetchAll():
            docs.append(self.doc_to_result(doc))
        return docs

    def addData(self, obj):
        self.db_createDocFromObject(obj)

    def search(self, collection, search, prop):
        docs = []
        aql = """FOR q IN """ + collection + """ FILTER q.""" + prop + """ LIKE
            "%""" + search + """%" RETURN q"""
        results = self.dbHandle.AQLQuery(aql, rawResults=False, batchSize=1)
        for doc in results:
            docs.append(self.doc_to_result(doc))
        return docs

    def doc_to_result(self, arangodoc):
        modstore = arangodoc.getStore()
        modstore["_key"] = arangodoc._key
        return modstore

    def db_createDocFromJson(self, json):
        for d in json:
            doc = self.collection.createDocument()
            for key, value in d.items():
                doc[key] = value
            doc._key = str(int(round(time.time() * 1000)))
            doc.save()

    @staticmethod
    def createDocFromObject(obj):
        c = ArangoConn(U.url, U.user, U.pw, U.db, U.collection)
        data = obj
        doc = c.collection.createDocument()
        for key, value in data.items():
            doc[key] = value
        doc._key = doc["id"]
        doc.save()
        c.connection.disconnectSession()
It kind of works like that. My problem is that the data that lands in the database is somehow mixed up.
As you can see in the screenshot, "id" and "id_str" are not the same - as they should be.
What I investigated so far:
I thought that at some point the default keys in the database might "collide" because of the threading, so I set the key to the tweet id.
I tried to do it without multiple threads. The threading doesn't seem to be the problem.
I looked at the data I send to the database... everything seems to be fine.
But as soon as I communicate with the db, the data mixes up.
My professor thought that maybe something in pyArango isn't thread-safe and messes up the data, but I don't think so, as threading doesn't seem to be the problem.
I have no ideas left where this behavior could come from...
Any ideas?
The screenshot shows the following values:
id : 892886691937214500
id_str : 892886691937214465
It looks like somewhere along the way the value is converted to an IEEE754 double, which cannot safely represent the latter value. So there is potentially some precision loss due to conversion.
A quick example in node.js (JavaScript uses IEEE754 doubles for any number values greater than 0xffffffff) shows that this is likely the cause of the problem:
$ node
> 892886691937214500
892886691937214500
> 892886691937214465
892886691937214500
So the question is where the conversion does happen. Can you check whether the python client program is correctly sending the expected values to ArangoDB, or does it already send the converted/truncated values?
In general, any integer number that exceeds 0x7fffffffffffffff will be truncated when stored in ArangoDB, or converted to an IEEE754 double. This can be avoided by storing the number values inside a string, but of course comparing two number strings will produce different results than comparing two numbers (e.g. "10" < "9" vs. 10 > 9).
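One way to follow the "store the number values inside a string" suggestion on the Python side (a sketch, not part of the original answer) is to decode large integers as strings while parsing the tweet JSON, so that strings are what get sent to ArangoDB:
import json

# json.loads lets you override how integers are parsed; returning them as
# strings preserves every digit of ids such as 892886691937214465.
# Note: this turns every integer in the document into a string.
raw = '{"id": 892886691937214465, "id_str": "892886691937214465"}'
data = json.loads(raw, parse_int=str)
print(data["id"])   # '892886691937214465' - no rounding to an IEEE754 double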

How to properly implement threading while writing out to a csv?

I'm pulling commit data from the Gerrit API, and the number of commits is in the 226,000 range. Since I have to make a request to an endpoint for each and every commit, this is understandably taking a long time. I was wondering how I could best implement threading into my current process.
I have two classes: a Project class, which drills down, retrieves all commits associated with it, and saves them as Commit objects that contain all the information necessary to then loop through and get the JSON associated with each. I am pulling them all into a big list, and then iterating through to call the get_data and write_data methods.
class Project(object):
    def __init__(self, name):
        self.name = name
        self.commits = []

    def add_commits(self, changes_list):
        for change in changes_list:
            change_id=change['change_id'],
            revision_list=change['revisions']
            self.commits.extend([Commit(rid, change_id)
                                 for rid in revision_list.keys()])

    def return_results(self, ger_obj, start=0):
        self.ger = ger_obj
        while True:
            endpoint = (r'/changes/?q=project:{project}&o=ALL_REVISIONS&'
                        r'S={num}'.format(
                            project=self.name,
                            num=start
                        ))
            logging.info('Endpoint: {}'.format(endpoint))
            try:
                changes = ger_obj.get(endpoint)
                self.add_commits(changes_list=changes)
            except HTTPError:
                break
            start += 500
            try:
                if not changes[-1].get('_more_changes'):
                    break
            except IndexError:
                break

class Commit(object):
    def __init__(self, rev_id, change_id):
        self.rev_id = rev_id
        self.change_id = change_id

    def get_data(self, ger_obj):
        endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
            c_id=self.change_id[0],
            r_id=self.rev_id
        ))
        try:
            self.data = ger_obj.get(endpoint)
        except HTTPError as e:
            logging.warning('Endpoint: {} did not return data'.format(
                endpoint
            ))
        else:
            self.data['commitid'] = self.data.get('commit')
            self.data['name'] = self.data.get('committer')['name']
            self.data['email'] = self.data.get('committer')['email']
            self.data['date'] = self.data.get('committer')['date']
            hash = md5()
            hash.update(json.dumps(self.data).encode('utf-8'))
            self.data['etl_checksum_md5'] = hash.hexdigest()
            self.data['etl_process_status'] = ETL_PROCESS_STATUS
            self.data['etl_datetime_local'] = ETL_DATETIME_LOCAL
            self.data['etl_pdi_version'] = ETL_PDI_VERSION
            self.data['etl_pdi_build_version'] = ETL_PDI_BUILD_VERSION
            self.data['etl_pdi_hostname'] = ETL_PDI_HOSTNAME
            self.data['etl_pdi_ipaddress'] = ETL_PDI_IPADDRESS
            self.data['message'] = self.data['message'].replace('\n', ' ').replace('|', '[pipe]')

    def write_data(self, writer):
        writer.writerow(self.data)
I'm thinking that the best place to implement the threads is once I have all the commits in a list and am ready to iterate over them:
projects = [Project(value['id']) for value in project_data.values()]
for project in projects[:10]:
    if project.name in bad_names.keys():
        project.name = bad_names[project.name]
    project.return_results(rest)
    all_commits.extend(project.commits)

fieldnames = get_fieldnames(
    'ods_gerrit.staging_gerrit_commits',
    REDSHIFT_POSTGRES_INFO)

with open('testfile.csv', 'wb') as outf:
    writer = DictWriter(
        outf,
        fieldnames=fieldnames,
        extrasaction='ignore',
        delimiter='|'
    )
    # Implement Threading?
    for commit in all_commits:
        commit.get_data(rest)
        try:
            commit.write_data(writer=writer)
        except AttributeError:
            continue
        except Exception:
            print commit.data, 'caused an exception.'
            continue
I've read a few threading tutorials, and am unsure as to how to properly do this. I'm particularly worried about overwriting data due to improper locking.
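One common pattern that fits the loop above (a sketch under the assumption that all_commits, rest and writer exist exactly as in the code shown; not a tested drop-in solution) is to fetch the commits in a thread pool while guarding the shared DictWriter with a lock, so only one thread writes a row at a time:
import threading
from concurrent.futures import ThreadPoolExecutor  # 'futures' backport on Python 2

write_lock = threading.Lock()

def fetch_and_write(commit):
    commit.get_data(rest)
    if not hasattr(commit, 'data'):
        return  # get_data() failed and already logged a warning
    with write_lock:
        # Only one thread at a time may touch the shared writer.
        writer.writerow(commit.data)

with ThreadPoolExecutor(max_workers=8) as pool:
    # Consume the iterator so exceptions raised in workers are surfaced.
    list(pool.map(fetch_and_write, all_commits))
This would replace the plain for loop inside the with open(...) block; because every writerow() happens under the lock, rows cannot interleave or overwrite each other.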
