Aggregate the levels of the orderbook - python

How can I aggregate the levels of the order book?
At the moment the levels are streamed at the instrument's 0.05 tick size, but I need them grouped into 0.50 price intervals.
How can I merge the individual levels of the order book?
This is the code:
from time import sleep
from pybit import usdt_perpetual

ws_linear = usdt_perpetual.WebSocket(
    test=False,
    ping_interval=30,  # the default is 30
    ping_timeout=10,   # the default is 10
    domain="bybit"     # the default is "bybit"
)

def handle_message(msg):
    print(msg)

ws_linear.orderbook_25_stream(
    handle_message, "ETHUSDT"
)

while True:
    sleep(1)
Thank you

Here is a usable order book implementation, if it helps.
Note: this code is not the cleanest, but it gets the job done.
import hmac
import json
import threading
import time
import websocket

ws_url = "wss://stream.bybit.com/realtime_public"
api_key = ""
api_secret = ""
order_book = []
orderbook = []
orderbook_size = []
orderbook_array_length = 1000
data = []

# Generate expires.
expires = int((time.time() + 1) * 1000)

# Generate signature.
signature = str(hmac.new(
    bytes(api_secret, "utf-8"),
    bytes(f"GET/realtime{expires}", "utf-8"), digestmod="sha256"
).hexdigest())

param = "api_key={api_key}&expires={expires}&signature={signature}".format(
    api_key=api_key,
    expires=expires,
    signature=signature
)
url = ws_url + "?" + param

def on_message(ws, message):
    global orderbook
    global orderbook_size
    message = json.loads(message)
    for i in range(0, max(len(message['data']['update']), len(message['data']['insert']), len(message['data']['delete'])), 1):
        if len(message['data']['update']) != 0 and i < len(message['data']['update']):
            orderbook_size.append((message['data']['update'][i]['price'], message['data']['update'][i]['size'], message['data']['update'][i]['symbol']))
            if len(orderbook_size) > orderbook_array_length:
                del (orderbook_size[0])
            data_append = [(message['data']['update'][i]['price'], message['data']['update'][i]['side'], message['data']['update'][i]['symbol'])]
            if message['data']['update'][i]['side'] == 'Buy':
                data_append_opposite = [(message['data']['update'][i]['price'], 'Sell', message['data']['update'][i]['symbol'])]
            else:
                data_append_opposite = [(message['data']['update'][i]['price'], 'Buy', message['data']['update'][i]['symbol'])]
            if data_append_opposite[0] in orderbook:
                orderbook.remove(data_append_opposite[0])
            if data_append[0] not in orderbook:
                orderbook.append((message['data']['update'][i]['price'], message['data']['update'][i]['side'], message['data']['update'][i]['symbol']))
                orderbook.sort(key=lambda x: x[0])
        if len(message['data']['insert']) != 0 and i < len(message['data']['insert']):
            orderbook_size.append((message['data']['insert'][i]['price'], message['data']['insert'][i]['size'], message['data']['insert'][i]['symbol']))
            if len(orderbook_size) > orderbook_array_length:
                del (orderbook_size[0])
            orderbook.append((message['data']['insert'][i]['price'], message['data']['insert'][i]['side'], message['data']['insert'][i]['symbol']))
            orderbook.sort(key=lambda x: x[0])
        if len(message['data']['delete']) != 0 and i < len(message['data']['delete']):
            data_remove = [(message['data']['delete'][i]['price'], message['data']['delete'][i]['side'], message['data']['delete'][i]['symbol'])]
            if data_remove[0] in orderbook:
                orderbook.remove(data_remove[0])

def on_error(ws, error):
    print(f"Error: {error}")

def on_close(ws):
    print(f"Connection close")

def on_open(ws):
    ws.send('{"op": "subscribe", "args": ["orderBookL2_25.BTCUSDT"]}')
    print("Opened connection")

ws = websocket.WebSocketApp(url, on_open=on_open, on_message=on_message, on_error=on_error, on_close=on_close)
wst = threading.Thread(target=ws.run_forever)
wst.daemon = True
wst.start()

while True:
    time.sleep(1)
    order_book = []
    try:
        for i in range(0, len(orderbook), 1):
            # entire orderbook
            for s in range(-1, len(orderbook_size) * -1, -1):
                if orderbook[i][0] == orderbook_size[s][0]:
                    order_book.append((orderbook[i][0], orderbook[i][1], orderbook_size[s][1]))
                    break
            # orderbook with depth of 1
            if orderbook[i - 1][1] == 'Buy' and orderbook[i][1] == 'Buy' and orderbook[i + 1][1] == 'Sell' and orderbook[i + 2][1] == 'Sell':
                for r in range(i, len(orderbook), 1):
                    if len(orderbook) > r > i + 25:
                        del (orderbook[r])
                    if r >= len(orderbook):
                        break
                for r in range(i, 0, -1):
                    if 0 < r < i - 25:
                        del (orderbook[r])
                    if r >= len(orderbook):
                        break
                for s in range(-1, len(orderbook_size) * -1, -1):
                    if orderbook[i][0] == orderbook_size[s][0]:
                        data.append((orderbook[i][0], orderbook[i][1], orderbook_size[s][1]))
                        break
                for s in range(-1, len(orderbook_size) * -1, -1):
                    if orderbook[i + 1][0] == orderbook_size[s][0]:
                        data.append((orderbook[i + 1][0], orderbook[i + 1][1], orderbook_size[s][1]))
                        break
                print(data)
                data = []
        print(order_book)
    except Exception as error:
        print(error)
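If the goal is only to regroup the levels into 0.50 price buckets, the aggregation step itself is independent of the exchange API. Here is a minimal sketch of that step; the (price, side, size) tuple layout (as in the order_book list above) and the 0.50 bucket size are assumptions you would adapt to your own data:

import math
from collections import defaultdict

def aggregate_levels(levels, bucket=0.50):
    """Group (price, side, size) levels into fixed price buckets and sum the sizes."""
    grouped = defaultdict(float)
    for price, side, size in levels:
        # floor bid prices and ceil ask prices so aggregated buckets never cross the spread
        if side == 'Buy':
            key = math.floor(float(price) / bucket) * bucket
        else:
            key = math.ceil(float(price) / bucket) * bucket
        grouped[(round(key, 2), side)] += float(size)
    return sorted(((p, s, sz) for (p, s), sz in grouped.items()), key=lambda x: x[0])

# example: two bids within the same 0.50 bucket are merged
print(aggregate_levels([(1850.05, 'Buy', 1.2), (1850.30, 'Buy', 0.8), (1850.55, 'Sell', 2.0)]))

You could call something like this on the maintained book inside the message handler (or once per second in the main loop) and print the aggregated levels instead of the raw ones.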

Related

Opening and keeping the window on PySide6

I'm trying to convert a program into a Mac app via PySide6. The program takes input about assignment names, dates, scores, and the percentage each counts toward the total, and calculates the total score. The program (in Python) works fine, but I'm stuck trying to convert it into an app. When I run my program, the window does pop up but instantly closes after opening, so I cannot type anything into it.
The input format of my program is a bit strange, but that's because it's adjusted to my school's website.
The original program code:
raw_grade = []
indexes = []
index = 0
score = []
percent = []
fi_score = []
raw = []
add = 0
denom = 0
total = 0
import sys
import re
from operator import itemgetter

print("Enter the data")

def multi_input():
    try:
        while True:
            data = input()
            if not data: break
            yield data
    except KeyboardInterrupt:
        return

data = list(multi_input())
for i in range(3, len(data), 4):
    index = i
    indexes.append(i)
raw_grade = [data[id] for id in indexes]
for i in range(len(raw_grade)):
    gc = raw_grade[i].split('\t')
    raw.append(gc)
for i in range(len(raw)):
    percent.append(raw[i][3])
    score.append(raw[i][4])
for i in range(len(score)):
    use_score = score[i].split('/')
    fi_score.append(use_score)
score = fi_score
score = [value for value in score if value != ['']]
for i in range(len(score)):
    score[i] = [float(i) for i in score[i]]
for i in range(len(score)):
    try:
        score[i] = score[i][0] / score[i][1] * 100
    except ZeroDivisionError:
        result = 0
for i in range(len(percent)):
    try:
        percent[i] = percent[i].replace('%', '')
    except ZeroDivisionError:
        result = 0
percent = [value for value in percent if value != '']
for i in range(len(percent)):
    try:
        percent = [float(i) for i in percent]
    except ZeroDivisionError:
        result = 0
print("graded assignments: ", len(score))
# calculation part
for i in range(len(score)):
    add = score[i] * percent[i] / 100
    total = total + add
    denom = denom + percent[i]
total = total / denom * 100
print(f"{total:05.2f}")
What I've worked on so far:
from PySide6 import QtCore, QtWidgets
from PySide6.QtWidgets import QMainWindow, QWidget, QLabel, QLineEdit, QApplication
from PySide6.QtWidgets import QPushButton
from PySide6.QtCore import QSize

raw_grade = []
indexes = []
index = 0
score = []
percent = []
fi_score = []
raw = []
add = 0
denom = 0
total = 0
import sys
import re
from operator import itemgetter

def multi_input(self):
    try:
        while True:
            self.nameLabel = QLabel(self)
            self.nameLabel.setText('Name:')
            self.line = QLineEdit(self)
            if not self.line.text(): break
            yield self.line.text()
    except KeyboardInterrupt:
        return

class MainWindow(QMainWindow):
    def __init__(self):
        QMainWindow.__init__(self)
        raw_grade = []
        indexes = []
        index = 0
        score = []
        percent = []
        fi_score = []
        raw = []
        add = 0
        denom = 0
        total = 0
        self.setMinimumSize(QSize(320, 140))
        self.setWindowTitle("PyQt Line Edit example (textfield) - pythonprogramminglanguage.com")
        multi_input(self)
        self.line = list(multi_input(self))
        pybutton = QPushButton('OK', self)
        pybutton.clicked.connect(self.clickMethod)
        pybutton.resize(200, 32)
        pybutton.move(80, 60)
        for i in range(3, len(self.line), 4):
            index = i
            indexes.append(i)
        raw_grade = [self.line[id] for id in indexes]
        score = fi_score
        score = [value for value in score if value != ['']]
        for i in range(len(raw_grade)):
            gc = raw_grade[i].split('\t')
            raw.append(gc)
        for i in range(len(raw)):
            percent.append(raw[i][3])
            score.append(raw[i][4])
        for i in range(len(score)):
            use_score = score[i].split('/')
            fi_score.append(use_score)
        for i in range(len(score)):
            score[i] = [float(i) for i in score[i]]
        for i in range(len(score)):
            try:
                score[i] = score[i][0] / score[i][1] * 100
            except ZeroDivisionError:
                result = 0
        for i in range(len(percent)):
            try:
                percent[i] = percent[i].replace('%', '')
            except ZeroDivisionError:
                result = 0
        percent = [value for value in percent if value != '']
        for i in range(len(percent)):
            try:
                percent = [float(i) for i in percent]
            except ZeroDivisionError:
                result = 0
        print("graded assignments: ", len(score))
        # calculation part
        for i in range(len(score)):
            add = score[i] * percent[i] / 100
            total = total + add
            denom = denom + percent[i]
        self.line = total / denom * 100
        print('Your name: ' + self.line.text())

    def clickMethod(self):
        print('Your name: ' + self.line.text())

if __name__ == "__main__":
    app = QApplication(sys.argv)
    window = MainWindow()
    window.show()
    app.exec()
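One thing that may help while restructuring this: in Qt the window only stays up and responsive if __init__ returns quickly and the heavy work runs in a slot after the user interacts. Below is a minimal sketch of that pattern, not the asker's full grading logic; the widget names and the idea of pasting the whole multi-line input into a QPlainTextEdit and parsing it when OK is clicked are assumptions for illustration:

import sys
from PySide6.QtWidgets import (QApplication, QMainWindow, QPlainTextEdit,
                               QPushButton, QLabel, QVBoxLayout, QWidget)

class GradeWindow(QMainWindow):
    def __init__(self):
        super().__init__()
        self.setWindowTitle("Grade calculator")
        self.edit = QPlainTextEdit(self)      # paste the raw data here
        self.button = QPushButton("OK", self)
        self.result = QLabel("", self)
        layout = QVBoxLayout()
        layout.addWidget(self.edit)
        layout.addWidget(self.button)
        layout.addWidget(self.result)
        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)
        # do NOT compute anything here; wait for the click
        self.button.clicked.connect(self.calculate)

    def calculate(self):
        data = [line for line in self.edit.toPlainText().splitlines() if line]
        # ... run the existing parsing/averaging code on `data` here ...
        self.result.setText(f"{len(data)} lines received")

if __name__ == "__main__":
    app = QApplication(sys.argv)
    window = GradeWindow()
    window.show()
    sys.exit(app.exec())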

Socket Programming Python Server and C client

This is my Python server code (it receives the data length from a C client and then receives an image file (.jpg)).
The problem is that when I receive the data length, the error "invalid base64-encoded string" occurs.
Here is my server code (unnecessary functions removed); the error occurs when the stringData part runs.
import os
import socket
import cv2
import numpy
import base64
import glob
import sys
import time
import threading
from datetime import datetime

class ServerSocket:
    def receiveImages(self):
        cnt_str = ''
        cnt = 0
        try:
            while True:
                if (cnt < 10):
                    cnt_str = '000' + str(cnt)
                elif (cnt < 100):
                    cnt_str = '00' + str(cnt)
                elif (cnt < 1000):
                    cnt_str = '0' + str(cnt)
                else:
                    cnt_str = str(cnt)
                if cnt == 0: startTime = time.localtime()
                cnt += 1
                length = self.conn.recv(64)
                print(length)
                length1 = length.decode('utf-8')
                print(length1)
                stringData = self.recvall(self.conn, int(length1))
                ## stringData = self.conn.recv(100000)
                ##print(stringData)
                msg = "ready"
                self.conn.sendall(msg.encode())
                stime = self.recvall(self.conn, 64)
                print('send time: ' + stime.decode('utf-8'))
                now = time.localtime()
                print('receive time: ' + datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f'))
                data = numpy.frombuffer(base64.b64decode(stringData), numpy.uint8)
                decimg = cv2.imdecode(data, 1)
                cv2.imshow("image", decimg)
                cv2.imwrite('./' + str(self.TCP_PORT) + '_images' + str(self.folder_num) + '/img' + cnt_str + '.jpg', decimg)
                cv2.waitKey(1)
                if (cnt == 60 * 10):
                    cnt = 0
                    convertThread = threading.Thread(target=self.convertImage(str(self.folder_num), 600, startTime))
                    convertThread.start()
                    self.folder_num = (self.folder_num + 1) % 2
        except Exception as e:
            print(e)
            #self.convertImage(str(self.folder_num), cnt, startTime)
            self.socketClose()
            cv2.destroyAllWindows()
            self.socketOpen()
            self.receiveThread = threading.Thread(target=self.receiveImages)
            self.receiveThread.start()

    def recvall(self, sock, count):
        buf = b''
        while count:
            newbuf = sock.recv(count)
            if not newbuf: return None
            buf += newbuf
            count -= len(newbuf)
        return buf

    def convertImage(self, fnum, count, now):
        img_array = []
        cnt = 0
        for filename in glob.glob('./' + str(self.TCP_PORT) + '_images' + fnum + '/*.jpg'):
            if (cnt == count):
                break
            cnt = cnt + 1
            img = cv2.imread(filename)
            height, width, layers = img.shape
            size = (width, height)
            img_array.append(img)
        file_date = self.getDate(now)
        file_time = self.getTime(now)
        name = 'video(' + file_date + ' ' + file_time + ').mp4'
        file_path = './videos/' + name
        out = cv2.VideoWriter(file_path, cv2.VideoWriter_fourcc(*'.mp4'), 20, size)
        for i in range(len(img_array)):
            out.write(img_array[i])
        out.release()
        print(u'complete')

def main():
    server = ServerSocket('ServerIP', PORT)

if __name__ == "__main__":
    main()
What could the problem be? My expectation is that decoding the string data fails because of how the C client encodes and frames it.
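For reference, here is a minimal Python sketch of the framing this server expects (the real client is in C, but the byte layout is the same): a length field padded to exactly 64 bytes, followed by a base64 payload of exactly that many bytes. If the C side sends fewer than 64 bytes for the length header, recv(64) can pull part of the payload into the length field, and the misaligned remainder would then fail base64 decoding in the way described. The connection setup shown in the comment is hypothetical:

import base64
import socket

def send_image(sock: socket.socket, jpg_bytes: bytes) -> None:
    """Send one frame using the 64-byte length header that the server's recv(64) expects."""
    payload = base64.b64encode(jpg_bytes)
    header = str(len(payload)).ljust(64).encode('utf-8')  # pad to exactly 64 bytes
    sock.sendall(header)
    sock.sendall(payload)

# usage sketch:
# with socket.create_connection(("ServerIP", PORT)) as s:
#     send_image(s, open("frame.jpg", "rb").read())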

list item to integer conversion triggers syntax error in python3

import re
import socket
import sys

def Check(ip, port):
    try:
        s = socket.socket()
        s.settimeout(0.3)
        s.connect((ip, port))
        return s.recv(512)
    except:
        pass

def Scan():
    start = sys.argv[1]
    end = sys.argv[2]
    endip = end.partition('.')
    currentip = start.split('.')
    while not (currentip == endip):
        targetip = currentip[0]+"."+currentip[1]+"."+currentip[2]+"."+currentip[3]
        print("Checking: "+targetip+"\n")
        result = Check(targetip, 21)
        if result:
            if re.search("FTP", result.decode('utf-8')):
                retard = open('ftps.txt', 'a')
                retard.write(targetip+"\n")
                retard.close()
        if (int(currentip[3]) < 255) and (int(currentip[0]) != int(endip[0])) and (int(currentip[1]) != int(endip[1])) and (int(currentip[2]) != int(endip[2])) and (int(currentip[3]) != int(endip[3])+1):
            int(currentip[3]) += 1
        elif (int(currentip[3]) == 255) and (int(currentip[0]) != int(endip[0])) and (int(currentip[1]) != int(endip[1])) and (int(currentip[2]) != int(endip[2])) and (int(currentip[3]) != int(endip[3])+1):
            if (int(currentip[2]) < 255):
                int(currentip[2]) += 1
                int(currentip[3]) = 0
            elif (int(currentip[2]) == 255):
                if (int(currentip[1]) < 255):
                    int(currentip[1]) += 1
                    int(currentip[2]) = 0
                    int(currentip[3]) = 0
                elif (int(currentip[0]) < int(endip[0])) and (int(currentip[0]) != 255) and (int(currentip[1]) == 255):
                    int(currentip[0]) += 1
                    int(currentip[1]) = 0
                    int(currentip[2]) = 0
                    int(currentip[3]) = 0

Scan()
int(currentip[0]) += 1 causes an error, while other items that are converted in exactly the same way trigger none.
File "ftpscan.py", line 46
int(currentip[0]) += 1
^
SyntaxError: cannot assign to function call
Basically, i += 1 is the same as i = i + 1.
In your case,
int(currentip[0]) += 1 is the same as int(currentip[0]) = int(currentip[0]) + 1.
So, technically, you can't assign to a function call.
Instead, this should work
currentip[0] = int(currentip[0]) + 1
You are trying to increment a number (what int() returns) as opposed to the contents of a variable.
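A tiny sketch of the difference, using a plain list of octet strings like currentip (illustrative only):

currentip = "192.168.0.1".split('.')       # ['192', '168', '0', '1']

# int(currentip[3]) += 1                    # SyntaxError: the int(...) call is not a variable
currentip[3] = str(int(currentip[3]) + 1)   # convert, add, store back as a string
print('.'.join(currentip))                  # 192.168.0.2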
"""
FTPScan by StYl3z
Greetz fly out to:
L0rd,Legolas,Prometheus,Smoky-Ice,izibitzi,Waterb0ng,MaXtOr
usage: python3 ftpscan.py <startip> <endip>
"""
import re
import socket
import sys
def Check(ip,port):
try:
s = socket.socket()
s.settimeout(0.3)
s.connect((ip,port))
return s.recv(512)
except:
pass
def Scan():
start = sys.argv[1]
end = sys.argv[2]
endip = end.split('.')
currentip = start.split('.')
while not (currentip == endip):
targetip = currentip[0]+"."+currentip[1]+"."+currentip[2]+"."+currentip[3]
print("Checking: "+targetip+"\n")
result = Check(targetip,21)
if result == 0:
if re.search("FTP",result.decode('utf-8')):
retard = open('ftps.txt','a')
retard.write(targetip+"\n")
retard.close()
if not (int(currentip[3])==255):
currentip[3] = int(currentip[3])+1
currentip[3] = str(currentip[3])
else:
if not(int(currentip[2])==255):
currentip[2] = int(currentip[2])+1
currentip[2] = str(currentip[2])
currentip[3] = str("0")
else:
if not(int(currentip[1])==255):
currentip[1] = int(currentip[1])+1
currentip[1] = str(currentip[1])
currentip[2] = str("0")
currentip[3] = str("0")
else:
if not(int(currentip[0])==255):
currentip[0] = int(currentip[0])+1
currentip[0] = str(currentip[0])
currentip[1] = str("0")
currentip[2] = str("0")
currentip[3] = str("0")
Scan()
This is the way I got it working. Thanks for trying to help me; I figured it out myself.

Confidence score of answer extracted using ELMo BiDAF model and AllenNLP

I'm working on a deep learning project where I use a bidirectional attention flow (BiDAF) model (an AllenNLP pretrained model) to build a question answering system. It uses the SQuAD dataset. The BiDAF model extracts the answer span from a paragraph. Is there any way to determine the confidence score (accuracy), or any other metric, for the answer extracted by the model?
I have used the evaluate subcommand from the allennlp package, but it only reports the score of the model after testing. I was hoping there is an easier way to solve this with another such command.
Attaching the code and the terminal output below.
from rake_nltk import Rake
from string import punctuation
from nltk.corpus import stopwords
from allennlp.predictors.predictor import Predictor
import spacy
import wikipedia
import re
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import traceback
from nltk.stem import SnowballStemmer
from nltk.util import ngrams
from math import log10
from flask import Flask, request, jsonify, render_template
from gevent.pywsgi import WSGIServer
import time
import multiprocessing as mp
from gtts import gTTS
import os
NLP = spacy.load('en_core_web_md')
stop = stopwords.words('english')
symbol = r"""!#$%^&*();:\n\t\\\"!\{\}\[\]<>-\?"""
stemmer = SnowballStemmer('english')
wikipedia.set_rate_limiting(True)
session = HTMLSession()
results = 5
try:
predictor = Predictor.from_path("bidaf-model-2017.09.15-charpad.tar.gz")
except:
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2018.11.30-charpad.tar.gz")
try:
srl = Predictor.from_path('srl-model-2018.05.25.tar.gz')
except:
srl = Predictor.from_path('https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz')
key = Rake(min_length=1, stopwords=stop, punctuations=punctuation, max_length=6)
wh_words = "who|what|how|where|when|why|which|whom|whose|explain".split('|')
stop.extend(wh_words)
session = HTMLSession()
output = mp.Queue()
def termFrequency(term, doc):
normalizeTermFreq = re.sub('[\[\]\{\}\(\)]', '', doc.lower()).split()
normalizeTermFreq = [stemmer.stem(i) for i in normalizeTermFreq]
dl = len(normalizeTermFreq)
normalizeTermFreq = ' '.join(normalizeTermFreq)
term_in_document = normalizeTermFreq.count(term)
#len_of_document = len(normalizeTermFreq )
#normalized_tf = term_in_document / len_of_document
normalized_tf = term_in_document
return normalized_tf, normalizeTermFreq, dl#, n_unique_term
def inverseDocumentFrequency(term, allDocs):
num_docs_with_given_term = 0
for doc in allDocs:
if term in doc:
num_docs_with_given_term += 1
if num_docs_with_given_term > 0:
total_num_docs = len(allDocs)
idf_val = log10(((total_num_docs+1) / num_docs_with_given_term))
term_split = term.split()
if len(term_split) == 3:
if len([term_split[i] for i in [0, 2] if term_split[i] not in stop]) == 2:
return idf_val*1.5
return idf_val
return idf_val
else:
return 0
def sent_formation(question, answer):
tags_doc = NLP(question)
tags_doc_cased = NLP(question.title())
tags_dict_cased = {i.lower_:i.pos_ for i in tags_doc_cased}
tags_dict = {i.lower_:i.pos_ for i in tags_doc}
question_cased = []
for i in question[:-1].split():
if tags_dict[i] == 'PROPN' or tags_dict[i] == 'NOUN':
question_cased.append(i.title())
else:
question_cased.append(i.lower())
question_cased.append('?')
question_cased = ' '.join(question_cased)
#del tags_dict,tags_doc, tags_doc_cased
pre = srl.predict(question_cased)
verbs = []
arg1 = []
for i in pre['verbs']:
verbs.append(i['verb'])
if 'B-ARG1' in i['tags']:
arg1.append((i['tags'].index('B-ARG1'), i['tags'].count('I-ARG1'))\
if not pre['words'][i['tags'].index('B-ARG1')].lower() in wh_words else \
(i['tags'].index('B-ARG2'), i['tags'].count('I-ARG2')))
arg1 = arg1[0] if arg1 else []
if not arg1:
verb_idx = pre['verbs'][0]['tags'].index('B-V')
verb = pre['words'][verb_idx] if pre['words'][verb_idx] != answer.split()[0].lower() else ''
subj_uncased = pre['words'][verb_idx+1:] if pre['words'][-1] not in symbol else \
pre['words'][verb_idx+1:-1]
else:
verb = ' '.join(verbs)
subj_uncased = pre['words'][arg1[0]:arg1[0]+arg1[1]+1]
conj = ''
if question.split()[0].lower() == 'when':
conj = ' on' if len(answer.split()) > 1 else ' in'
subj = []
for n, i in enumerate(subj_uncased):
if tags_dict_cased[i.lower()] == 'PROPN' and tags_dict[i.lower()] != 'VERB' or n == 0:
subj.append(i.title())
else:
subj.append(i.lower())
subj[0] = subj[0].title()
print(subj)
print(pre)
subj = ' '.join(subj)
sent = "{} {}{} {}.".format(subj, verb, conj, answer if answer[-1] != '.' else answer[:-1])
return sent
class extractAnswer:
def __init__(self):
self.wiki_error = (wikipedia.exceptions.DisambiguationError,
wikipedia.exceptions.HTTPTimeoutError,
wikipedia.exceptions.WikipediaException)
self.article_title = None
# symbol = """!#$%^&*();:\n\t\\\"!\{\}\[\]<>-\?"""
def extractAnswer_model(self, passage, question, s=0.4, e=0.3, wiki=False):
if type(passage) == list:
passage = " ".join(passage)
if not question[-1] == '?':
question = question+'?'
pre = predictor.predict(passage=passage, question=question)
if wiki:
if max(pre['span_end_probs']) > 0.5:
s = 0.12
elif max(pre['span_end_probs']) > 0.4:
s = 0.13
elif max(pre['span_end_probs']) > 0.35:
s = 0.14
if max(pre['span_start_probs']) > 0.5:
e = 0.12
elif max(pre['span_start_probs']) > 0.4:
e = 0.14
elif max(pre['span_start_probs']) > 0.3:
e = 0.15
if max(pre['span_start_probs']) > s and max(pre['span_end_probs']) > e:
key.extract_keywords_from_text(question)
ques_key = [stemmer.stem(i) for i in ' '.join(key.get_ranked_phrases())]
key.extract_keywords_from_text(passage)
pass_key = [stemmer.stem(i) for i in ' '.join(key.get_ranked_phrases())]
l = len(ques_key)
c = 0
for i in ques_key:
if i in pass_key:
c += 1
if c >= l/2:
print(max(pre['span_start_probs']),
max(pre['span_end_probs']))
if wiki:
return pre['best_span_str'], max(pre['span_start_probs']) + max(pre['span_end_probs'])
try:
ans = sent_formation(question, pre['best_span_str'])
except:
ans = pre['best_span_str']
print(traceback.format_exc())
return ans
print(ques_key, c, l)
print(max(pre['span_start_probs']), max(pre['span_end_probs']))
return 0, 0
else:
print(max(pre['span_start_probs']), max(pre['span_end_probs']), pre['best_span_str'])
return 0, 0
def wiki_search_api(self, query):
article_list = []
try:
article_list.extend(wikipedia.search(query, results=results))
print(article_list)
return article_list
except self.wiki_error:
params = {'search': query, 'profile': 'engine_autoselect',
'format': 'json', 'limit': results}
article_list.extend(requests.get('https://en.wikipedia.org/w/api.php?action=opensearch',
params=params).json()[1])
return article_list
except:
print('Wikipedia search error!')
print(traceback.format_exc())
return 0
def wiki_passage_api(self, article_title, article_list, output):
# Disambiguation_title = {}
try:
passage = wikipedia.summary(article_title)
output.put((article_title, self.passage_pre(passage)))
except wikipedia.exceptions.DisambiguationError as e:
print(e.options[0], e.options)
Disambiguation_pass = {}
for p in range(2 if len(e.options) > 1 else len(e.options)):
params = {'search':e.options[p], 'profile':'engine_autoselect', 'format':'json'}
article_url = requests.get('https://en.wikipedia.org/w/api.php?action=opensearch',
params=params).json()
if not article_url[3]:
continue
article_url = article_url[3][0]
r = session.get(article_url)
soup = BeautifulSoup(r.html.raw_html)
print(soup.title.string)
article_title_dis = soup.title.string.rsplit('-')[0].strip()
if article_title_dis in article_list:
print('continue')
continue
try:
url = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&titles={}".format(article_title_dis)
passage = requests.get(url).json()['query']['pages']
for i in passage.keys():
if 'extract' in passage[i]:
Disambiguation_pass[article_title_dis] = self.passage_pre(passage[i]['extract'])
except wikipedia.exceptions.HTTPTimeoutError:
passage = wikipedia.summary(article_title_dis)
Disambiguation_pass[article_title_dis] = self.passage_pre(passage)
except:
Disambiguation_pass[article_title_dis] = ''
continue
output.put((article_title, Disambiguation_pass))
except:
output.put((article_title, ''))
print(traceback.format_exc())
def sorting(self, article, question, topic):
processes = [mp.Process(target=self.wiki_passage_api, args=(article[x], article, output))\
for x in range(len(article))]
for p in processes:
p.start()
for p in processes:
p.join(timeout=3)
results_p = [output.get() for p in processes]
article_list = []
passage_list = []
for i, j in results_p:
if type(j) != dict and j:
article_list.append(i)
passage_list.append(j)
elif type(j) == dict and j:
for k, l in j.items():
if l:
article_list.append(k)
passage_list.append(l)
normalize_passage_list = []
start = time.time()
keywords = " ".join(self.noun+self.ques_key+[topic.lower()])
keywords = re.sub('[{0}]'.format(symbol), ' ', keywords).split()
question = question+' '+topic
ques_tokens = [stemmer.stem(i.lower()) for i in question.split() \
if i.lower() not in wh_words]
print(ques_tokens)
keywords_bigram = [' '.join(i) for i in list(ngrams(ques_tokens, 2)) \
if i[0] not in stop and i[1] not in stop]
if len(ques_tokens) > 3:
keywords_trigram = [' '.join(i) for i in list(ngrams(ques_tokens, 3)) \
if (i[0] in stop) + (i[2] in stop) + (i[1] in stop) < 3]
else:
keywords_trigram = []
if len(ques_tokens) > 5:
keywords_4gram = [' '.join(i) for i in list(ngrams(ques_tokens, 4)) \
if (i[0] in stop) + (i[2] in stop) +(i[1] in stop)+(i[3] in stop) < 4]
else:
keywords_4gram = []
keywords_unigram = list(set([stemmer.stem(i.lower()) for i in keywords \
if i.lower() not in stop]))
keywords = keywords_unigram+list(set(keywords_bigram))+keywords_trigram+keywords_4gram
tf = []
if not passage_list:
return 0
pass_len = []
#n_u_t=[]
#key_dict = {i: keywords.count(i) for i in keywords}
print('Extraction complete')
#remove_pass={}
#for n,i in enumerate(passage_list):
#if len(i)<200 or not i:
#remove_pass[article_list[n]]=i
#print(n, article_list[n])
#passage_list=[i for i in passage_list if i not in remove_pass.values()]
#article_list=[i for i in article_list if i not in remove_pass.keys()]
passage_list_copy = passage_list.copy()
article_list_copy = article_list.copy()
for i in range(len(passage_list_copy)):
if passage_list.count(passage_list_copy[i]) > 1:
passage_list.remove(passage_list_copy[i])
article_list.remove(article_list_copy[i])
print('Copy:', article_list_copy[i])
del passage_list_copy
del article_list_copy
for n, i in enumerate(passage_list):
temp_tf = {}
c = 0
for j in keywords:
temp_tf[j], temp_pass, temp_len = termFrequency(j, i + ' ' + article_list[n])
if temp_tf[j]:
c += 1
normalize_passage_list.append(temp_pass)
pass_len.append(temp_len)
temp_tf['key_match'] = c
tf.append(temp_tf)
print(pass_len)
print(keywords)
idf = {}
for i in keywords:
idf[i] = inverseDocumentFrequency(i, normalize_passage_list)
#print(tf, idf)
tfidf = []
#b=0.333 #for PLN
b, k = 0.75, 1.2 #for BM25
avg_pass_len = sum(pass_len)/len(pass_len)
#pivot=sum(n_u_t)/len(n_u_t)
for n, i in enumerate(tf):
tf_idf = 0
#avg_tf=sum(i.values())/len(i)
key_match_ratio = i['key_match']/len(keywords)
for j in keywords:
#tf_idf+=idf[j]*((log(1+log(1+i[j])))/(1-b+(b*pass_len[n]/avg_pass_len))) #PLN
tf_idf += idf[j]*(((k+1)*i[j])/(i[j]+k*(1-b+(b*pass_len[n]/avg_pass_len)))) #BM25
tfidf.append(tf_idf*key_match_ratio)
tfidf = [i/sum(tfidf)*100 for i in tfidf if any(tfidf)]
if not tfidf:
return 0, 0, 0, 0, 0
print(tfidf)
print(article_list, len(passage_list))
if len(passage_list) > 1:
sorted_tfidf = sorted(tfidf, reverse=1)
idx1 = tfidf.index(sorted_tfidf[0])
passage1 = passage_list[idx1]
#article_title=
tfidf1 = sorted_tfidf[0]
idx2 = tfidf.index(sorted_tfidf[1])
passage2 = passage_list[idx2]
article_title = (article_list[idx1], article_list[idx2])
tfidf2 = sorted_tfidf[1]
else:
article_title = 0
tfidf2 = 0
if passage_list:
passage1 = passage_list[0]
tfidf1 = tfidf[0]
passage2 = 0
else:
passage1 = 0
passage2 = 0
tfidf1, tfidf2 = 0, 0
end = time.time()
print('TFIDF time:', end-start)
return passage1, passage2, article_title, tfidf1, tfidf2
def passage_pre(self, passage):
#passage=re.findall("[\da-zA-z\.\,\'\-\/\–\(\)]*", passage)
passage = re.sub('\n', ' ', passage)
passage = re.sub('\[[^\]]+\]', '', passage)
passage = re.sub('pronunciation', '', passage)
passage = re.sub('\\\\.+\\\\', '', passage)
passage = re.sub('{.+}', '', passage)
passage = re.sub(' +', ' ', passage)
return passage
def wiki(self, question, topic=''):
if not question:
return 0
question = re.sub(' +', ' ', question)
question = question.title()
key.extract_keywords_from_text(question)
self.ques_key = key.get_ranked_phrases()
doc = NLP(question)
self.noun = [str(i).lower() for i in doc.noun_chunks if str(i).lower() not in wh_words]
print(self.ques_key, self.noun)
question = re.sub('[{0}]'.format(symbol), ' ', question)
if not self.noun + self.ques_key:
return 0
article_list = None
question = question.lower()
if self.noun:
if len(self.noun) == 2 and len(" ".join(self.noun).split()) < 6:
#question1=question
self.noun = " ".join(self.noun).split()
if self.noun[0] in stop:
self.noun.pop(0)
self.noun = question[question.index(self.noun[0]):question.index(self.noun[-1]) \
+len(self.noun[-1])+1].split()
#del question1
print(self.noun)
article_list = self.wiki_search_api(' '.join(self.noun))
if self.ques_key and not article_list:
article_list = self.wiki_search_api(self.ques_key[0])
if not article_list:
article_list = self.wiki_search_api(' '.join(self.ques_key))
if not article_list:
print('Article not found on wikipedia.')
return 0, 0
article_list = list(set(article_list))
passage1, passage2, article_title, tfidf1, tfidf2 = self.sorting(article_list,
question, topic)
if passage1:
ans1, conf1 = self.extractAnswer_model(passage1, question, s=0.20, e=0.20, wiki=True)
else:
ans1, conf1 = 0, 0
if ans1:
conf2 = 0
if len(ans1) > 600:
print(ans1)
print('Repeat')
ans1, conf1 = self.extractAnswer_model(ans1, question, s=0.20, e=0.20, wiki=True)
threshhold = 0.3 if not ((tfidf1- tfidf2) <= 10) else 0.2
if round(tfidf1- tfidf2) < 5:
threshhold = 0
if (tfidf1- tfidf2) > 20:
threshhold = 0.35
if (tfidf1- tfidf2) > 50:
threshhold = 1
if (passage2 and conf1 < 1.5) or (tfidf1 - tfidf2) < 10:
ans2, conf2 = self.extractAnswer_model(passage2, question, s=0.20, e=0.20,
wiki=True) if passage2 else (0, 0)
title = 0
if round(conf1, 2) > round(conf2, 2) - threshhold:
print('ans1')
ans = ans1
title = article_title[0] if article_title else 0
else:
print('ans2')
title = article_title[1] if article_title else 0
ans = ans2
if not question[-1] == '?':
question = question+'?'
try:
ans = sent_formation(question, ans)
except:
print(traceback.format_exc())
print(ans, '\n', '\n', article_title)
return ans, title
extractor = extractAnswer()
app = Flask(__name__)
#app.route("/", methods=["POST", "get"])
#app.route("/ans")
def ans():
start = time.time()
question = request.args.get('question')
topic = request.args.get('topic')
passage = request.args.get('passage')
if not question:
return render_template('p.html')
if not topic:
topic = ''
if passage:
answer = extractor.extractAnswer_model(passage, question)
else:
answer, title = extractor.wiki(question, topic)
end = time.time()
if answer:
mytext = str(answer)
language = 'en'
myobj = gTTS(text=mytext, lang=language, slow=False)
myobj.save("welcome.mp3")
# prevName = 'welcome.mp3'
#newName = 'static/welcome.mp3'
#os.rename(prevName,newName)
return render_template('pro.html', answer=answer)
else:
return jsonify(Status='E', Answer=answer, Time=end-start)
#app.route("/audio_del/", methods=["POST", "get"])
def audio_del():
return render_template('p.html');
#app.route("/audio_play/", methods=["POST", "get"])
def audio_play():
os.system("mpg321 welcome.mp3")
return render_template('white.html')
if __name__ == "__main__":
PORT = 7091
HTTP_SERVER = WSGIServer(('0.0.0.0', PORT), app)
print('Running on',PORT, '...')
HTTP_SERVER.serve_forever()
![Output in the terminal for a question I've asked](https://i.stack.imgur.com/6pyv5.jpg)
I came across a possible solution to this after looking deeply into the output returned by the model. Although this is probably not something you can rely on with full accuracy, it did the task in my case:
Note that the text answer which is "best_span_str" is always a subarray of the passage. It spans the range which is stored in "best_span".
i.e., "best_span" contains the start and end index of the answer.
Now, the output data contains a property named "span_end_probs".
"span_end_probs" contains a list of values that correspond to all the words present in the text input.
If you look closely for various inputs, the value is always maximum at one of the indexes within the starting and ending range that "best_span" contains. This value seemed to be very similar to the confidence levels that we need. Let's call this value score. All you need to do now is to try some inputs and find a suitable method to use this score as a metric.
e.g.: if you need a threshold value for some application, you can try a number of test inputs and find a value that is most accurate. In my case, this was around 0.35.
i.e. if the score is less than 0.35, it prints "answer not found", and if it is greater than or equal to 0.35, it prints the string in "best_span_str".
Here's my code snippet:
from allennlp.predictors.predictor import Predictor

passage = '--INPUT PASSAGE--'
question = '--INPUT QUESTION--'

predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bidaf-elmo.2021-02-11.tar.gz")
output = predictor.predict(
    passage=passage,
    question=question
)
score = max(output["span_end_probs"])
if score < 0.35:
    print('Answer not found')
else:
    print(output["best_span_str"])
You can readily see the example input and output here.
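If you want the start probability to count as well (the question's own wiki() helper sums both maxima as its confidence value), a variation on the same snippet could look like this, assuming the same output dictionary; the 0.7 cut-off is an assumption you would tune on your own test questions, just like 0.35 above:

# combine the two span probabilities into a single confidence score
score = max(output["span_start_probs"]) + max(output["span_end_probs"])
if score < 0.7:
    print('Answer not found')
else:
    print(output["best_span_str"])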

Issue with Python garbage collector?

I have a simple program which reads a large file containing a few million rows, parses each row (numpy array), converts it into an array of doubles (Python array) and later writes it into an hdf5 file. I repeat this loop for multiple days. After reading each file, I delete all the objects and call the garbage collector. When I run the program, the first day is parsed without any error, but on the second day I get a MemoryError. I monitored the memory usage of my program: during the first day of parsing, memory usage is around 1.5 GB. When the first day's parsing is finished, memory usage goes down to 50 MB. Now when the second day starts and I try to read the lines from the file, I get a MemoryError. Following is the output of the program.
source file extracted at C:\rfadump\au\2012.08.07.txt
parsing started
current time: 2012-09-16 22:40:16.829000
500000 lines parsed
1000000 lines parsed
1500000 lines parsed
2000000 lines parsed
2500000 lines parsed
3000000 lines parsed
3500000 lines parsed
4000000 lines parsed
4500000 lines parsed
5000000 lines parsed
parsing done.
end time is 2012-09-16 23:34:19.931000
total time elapsed 0:54:03.102000
repacking file
done
> s:\users\aaj\projects\pythonhf\rfadumptohdf.py(132)generateFiles()
-> while single_date <= self.end_date:
(Pdb) c
*** 2012-08-08 ***
source file extracted at C:\rfadump\au\2012.08.08.txt
cought an exception while generating file for day 2012-08-08.
Traceback (most recent call last):
File "rfaDumpToHDF.py", line 175, in generateFile
lines = self.rawfile.read().split('|\n')
MemoryError
I am very sure that the Windows Task Manager shows the memory usage as 50 MB for this process. It looks like the garbage collector or memory manager for Python is not calculating the free memory correctly. There should be a lot of free memory, but it thinks there is not enough.
Any idea?
EDIT
Adding my code here
I will put parts of my code here. I am new to Python, so please pardon my coding style.
module 1
def generateFile(self, current_date):
    try:
        print "*** %s ***" % current_date.strftime("%Y-%m-%d")
        weekday = current_date.weekday()
        if weekday >= 5:
            print "skipping weekend"
            return
        self.taqdb = taqDB(self.index, self.offset)
        cache_filename = os.path.join(self.cache_dir, current_date.strftime("%Y.%m.%d.h5"))
        outputFile = config.hdf5.filePath(self.index, date=current_date)
        print "cache file: ", cache_filename
        print "output file: ", outputFile
        tempdir = "C:\\rfadump\\"+self.region+"\\"
        input_filename = tempdir + filename
        print "source file extracted at %s " % input_filename
        ## universe
        reader = rfaTextToTAQ.rfaTextToTAQ(self.tickobj)  ## PARSER
        count = 0
        self.rawfile = open(input_filename, 'r')
        lines = self.rawfile.read().split('|\n')
        total_lines = len(lines)
        self.rawfile.close()
        del self.rawfile
        print "parsing started"
        start_time = dt.datetime.now()
        print "current time: %s" % start_time
        #while(len(lines) > 0):
        while(count < total_lines):
            #line = lines.pop(0) ## This slows down processing
            result = reader.parseline(lines[count]+"|")
            count += 1
            if(count % 500000 == 0):
                print "%d lines parsed" %(count)
            if(result == None):
                continue
            ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = result
            if(len(levelsUpdated) == 0 and tradeupdate == False):
                continue
            self.taqdb.insert(result)
        ## write to hdf5 TODO
        writer = h5Writer.h5Writer(cache_filename, self.tickobj)
        writer.write(self.taqdb.groups)
        writer.close()
        del lines
        del self.taqdb, self.tickobj
        ##########################################################
        print "parsing done."
        end_time = dt.datetime.now()
        print "end time is %s" % end_time
        print "total time elapsed %s" % (end_time - start_time)
        defragger = hdf.HDF5Defragmenter()
        defragger.Defrag(cache_filename, outputFile)
        del defragger
        print "done"
        gc.collect(2)
    except:
        print "cought an exception while generating file for day %s." % current_date.strftime("%Y-%m-%d")
        tb = traceback.format_exc()
        print tb
module 2 - taqdb - to store parsed data in an array
class taqDB:
    def __init__(self, index, offset):
        self.index = index
        self.tickcfg = config.hdf5.getTickConfig(index)
        self.offset = offset
        self.groups = {}

    def getGroup(self, ric):
        if (self.groups.has_key(ric) == False):
            self.groups[ric] = {}
        return self.groups[ric]

    def getOrderbookArray(self, ric, group):
        datasetname = orderBookName
        prodtype = self.tickcfg.getProdType(ric)
        if(prodtype == ProdType.INDEX):
            return
        orderbookArrayShape = self.tickcfg.getOrderBookArrayShape(prodtype)
        if(group.has_key(datasetname) == False):
            group[datasetname] = array.array("d")
            orderbookArray = self.tickcfg.getOrderBookArray(prodtype)
            return orderbookArray
        else:
            orderbookArray = group[datasetname]
            if(len(orderbookArray) == 0):
                return self.tickcfg.getOrderBookArray(prodtype)
            lastOrderbook = orderbookArray[-orderbookArrayShape[1]:]
            return np.array([lastOrderbook])

    def addToDataset(self, group, datasetname, timestamp, arr):
        if(group.has_key(datasetname) == False):
            group[datasetname] = array.array("d")
        arr[0,0] = timestamp
        a1 = group[datasetname]
        a1.extend(arr[0])

    def addToOrderBook(self, group, timestamp, arr):
        self.addToDataset(self, group, orderBookName, timestamp, arr)

    def insert(self, data):
        ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = data
        delta = dt.timedelta(hours=timestamp.hour, minutes=timestamp.minute, seconds=timestamp.second, microseconds=(timestamp.microsecond/1000))
        timestamp = float(str(delta.seconds)+'.'+str(delta.microseconds)) + self.offset
        ## write to array
        group = self.getGroup(ric)
        orderbookUpdate = False
        orderbookArray = self.getOrderbookArray(ric, group)
        nonzero = quotes.nonzero()
        orderbookArray[nonzero] = quotes[nonzero]
        if(np.any(nonzero)):
            self.addToDataset(group, orderBookName, timestamp, orderbookArray)
        if(tradeupdate == True):
            self.addToDataset(group, tradeName, timestamp, trades)
Module 3- Parser
class rfaTextToTAQ:
"""RFA Raw dump file reader. Readers single line (record) and returns an array or array of fid value pairs."""
def __init__(self,tickconfig):
self.tickconfig = tickconfig
self.token = ''
self.state = ReadState.SEQ_NUM
self.fvstate = fvstate.FID
self.quotes = np.array([]) # read from tickconfig
self.trades = np.array([]) # read from tickconfig
self.prodtype = ProdType.STOCK
self.allquotes = {}
self.alltrades = {}
self.acvol = 0
self.levelsUpdated = []
self.quoteUpdate = False
self.tradeUpdate = False
self.depth = 0
def updateLevel(self, index):
if(self.levelsUpdated.__contains__(index) == False):
self.levelsUpdated.append(index)
def updateQuote(self, fidindex, field):
self.value = float(self.value)
if(self.depth == 1):
index = fidindex[0]+(len(self.tickconfig.stkQuotes)*(self.depth - 1))
self.quotes[index[0]][fidindex[1][0]] = self.value
self.updateLevel(index[0])
else:
self.quotes[fidindex] = self.value
self.updateLevel(fidindex[0][0])
self.quoteUpdate = True
def updateTrade(self, fidindex, field):
#self.value = float(self.value)
if(self.tickconfig.tradeUpdate(self.depth) == False):
return
newacvol = float(self.value)
if(field == acvol):
if(self.value > self.acvol):
tradesize = newacvol - self.acvol
self.acvol = newacvol
self.trades[fidindex] = tradesize
if(self.trades.__contains__(0) == False):
self.tradeUpdate = True
else:
self.trades[fidindex] = self.value
if(not (self.trades[0,1]==0 or self.trades[0,2]==0)):
self.tradeUpdate = True
def updateResult(self):
field = ''
valid, field = field_dict.FIDToField(int(self.fid), field)
if(valid == False):
return
if(self.value == '0'):
return
if(self.prodtype == ProdType.STOCK):
fidindex = np.where(self.tickconfig.stkQuotes == field)
if(len(fidindex[0]) == 0):
fidindex = np.where(self.tickconfig.stkTrades == field)
if(len(fidindex[0]) == 0):
return
else:
self.updateTrade(fidindex, field)
else:
self.updateQuote(fidindex, field)
else:
fidindex = np.where(self.tickconfig.futQuotes == field)
if(len(fidindex[0]) == 0):
fidindex = np.where(self.tickconfig.futTrades == field)
if(len(fidindex[0]) == 0):
return
else:
self.updateTrade(fidindex, field)
else:
self.updateQuote(fidindex, field)
def getOrderBookTrade(self):
if (self.allquotes.has_key(self.ric) == False):
acvol = 0
self.allquotes[self.ric] = self.tickconfig.getOrderBookArray(self.prodtype)
trades = self.tickconfig.getTradesArray()
self.alltrades[self.ric] = [trades, acvol]
return self.allquotes[self.ric], self.alltrades[self.ric]
def parseline(self, line):
self.tradeUpdate = False
self.levelsUpdated = []
pos = 0
length = len(line)
self.state = ReadState.SEQ_NUM
self.fvstate = fvstate.FID
self.token = ''
ch = ''
while(pos < length):
prevChar = ch
ch = line[pos]
pos += 1
#SEQ_NUM
if(self.state == ReadState.SEQ_NUM):
if(ch != ','):
self.token += ch
else:
self.seq_num = int(self.token)
self.state = ReadState.TIMESTAMP
self.token = ''
# TIMESTAMP
elif(self.state == ReadState.TIMESTAMP):
if(ch == ' '):
self.token = ''
elif(ch != ','):
self.token += ch
else:
if(len(self.token) != 12):
print "Invalid timestamp format. %s. skipping line.\n", self.token
self.state = ReadState.SKIPLINE
else:
self.timestamp = datetime.strptime(self.token,'%H:%M:%S.%f')
self.state = ReadState.RIC
self.token = ''
# RIC
elif(self.state == ReadState.RIC):
if(ch != ','):
self.token += ch
else:
self.ric = self.token
self.token = ''
self.ric, self.depth = self.tickconfig.replaceRic(self.ric)
self.prodtype = self.tickconfig.getProdType(self.ric)
if(self.tickconfig.subscribed(self.ric)):
self.state = ReadState.UPDATE_TYPE
self.quotes, trades = self.getOrderBookTrade()
self.trades = trades[0]
self.acvol = trades[1]
else:
self.state = ReadState.SKIPLINE
# UPDATE_TYPE
elif(self.state == ReadState.UPDATE_TYPE):
if(ch != '|'):
self.token += ch
else:
self.update_type = self.token
self.token = ''
self.state = ReadState.FVPAIRS
#SKIPLINE
elif(self.state == ReadState.SKIPLINE):
return None
# FV PAIRS
elif(self.state == ReadState.FVPAIRS):
# FID
if(self.fvstate == fvstate.FID):
if(ch != ','):
if(ch.isdigit() == False):
self.token = self.value+ch
self.fvstate = fvstate.FIDVALUE
self.state = ReadState.FVPAIRS
else:
self.token += ch
else:
self.fid = self.token
self.token = ''
self.fvstate = fvstate.FIDVALUE
self.state = ReadState.FVPAIRS
# FIDVALUE
elif(self.fvstate == fvstate.FIDVALUE):
if(ch != '|'):
self.token += ch
else:
self.value = self.token
self.token = ''
self.state = ReadState.FVPAIRS
self.fvstate = fvstate.FID
# TODO set value
self.updateResult()
return self.ric, self.timestamp, self.quotes, self.trades, self.levelsUpdated, self.tradeUpdate
Thanks.
The only reliable way to free memory is to terminate the process.
So, if your main program spawns a worker process to do most of the work (the stuff that is done in one day) then when that worker process completes, the memory used will be freed:
import multiprocessing as mp

def work(date):
    # Do most of the memory-intensive work here
    ...

while single_date <= self.end_date:
    proc = mp.Process(target=work, args=(single_date,))
    proc.start()
    proc.join()
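Since a separate process does not share memory with the parent, anything the worker produces has to come back explicitly, for example through files (as the HDF5 writer here already does) or a queue. A minimal sketch with a queue, assuming the worker only hands back a small summary object:

import multiprocessing as mp

def work(date, result_queue):
    # ... memory-intensive parsing and HDF5 writing for one day ...
    result_queue.put((date, "ok"))   # hand back only a small summary, not the parsed data

if __name__ == "__main__":
    queue = mp.Queue()
    proc = mp.Process(target=work, args=("2012-08-08", queue))
    proc.start()
    summary = queue.get()            # read the result before join()
    proc.join()
    print(summary)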
