Multiprocessing and relationship traversal in Python

I am trying to use multiprocessing to speed up traversing a relationship graph. I want to capture items whose total is less than 1000; if a parent's total is over 1000, its children are processed instead, until there is nothing left to check.
The mock-up below shows that ThreadPoolExecutor only processes the items initially handed to the class, even though self.search_queue_list is still populated. I also tried a Queue instead of a list, with similar results. Synchronous processing works as expected for both the list and the Queue. Is there a way to make the parallel version work when the collection of items to check can grow while it is being consumed?
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from time import sleep

dummy_data = {
    'id1': {'total': 1001, 'children': ['id101', 'id102']},  # over 1000, children will be processed
    'id2': {'total': 999, 'children': ['id201', 'id202']},   # under 1000, children won't be processed
    'id101': {'total': 501, 'children': ['more_children']},
    'id102': {'total': 500, 'children': ['more_children']},
    'id201': {'total': 499, 'children': ['more_children']},
    'id202': {'total': 500, 'children': ['more_children']},
}
class SearchDummy(object):
    def __init__(self, start_list):
        # with list
        self.search_queue_list = start_list
        # with Queue
        self.search_queue_queue = Queue()
        for item in self.search_queue_list:
            self.search_queue_queue.put(item)
        self.good_ids = []

    def get_total(self, search_id):
        # artificial delay
        sleep(0.5)
        return dummy_data[search_id]['total']

    def get_children(self, search_id):
        # artificial delay
        sleep(0.5)
        return dummy_data[search_id]['children']

    # START LIST
    def current_check_list(self):
        # get first element in search_queue_list
        current_id = self.search_queue_list.pop(0)
        # check whether current_id's total is over 1000
        if self.get_total(current_id) <= 1000:
            self.good_ids.append(current_id)
        else:
            # append children to search_queue_list
            self.search_queue_list.extend(self.get_children(current_id))

    def search_list(self):
        while self.search_queue_list:
            self.current_check_list()

    def multi_search_list(self):
        with ThreadPoolExecutor() as e:
            while self.search_queue_list:
                e.submit(self.current_check_list)
    # END LIST

    # START QUEUE
    def current_check_queue(self):
        # get item from search_queue_queue
        current_id = self.search_queue_queue.get()
        # check whether current_id's total is over 1000
        if self.get_total(current_id) <= 1000:
            self.good_ids.append(current_id)
        else:
            # put children in search_queue_queue
            for child in self.get_children(current_id):
                self.search_queue_queue.put(child)

    def search_queue(self):
        while not self.search_queue_queue.empty():
            self.current_check_queue()

    def multi_search_queue(self):
        with ThreadPoolExecutor() as e:
            while not self.search_queue_queue.empty():
                e.submit(self.current_check_queue)
    # END QUEUE
# synchronous list
s = SearchDummy(['id1', 'id2'])
s.search_list()
print('List output', s.good_ids)  # returns ['id2', 'id101', 'id102']
print('Remaining list size', len(s.search_queue_list))  # returns 0

# synchronous queue
s = SearchDummy(['id1', 'id2'])
s.search_queue()
print('Queue output', s.good_ids)  # returns ['id2', 'id101', 'id102']
print('Remaining queue size', s.search_queue_queue.qsize())  # returns 0

# threaded list (ThreadPoolExecutor)
s = SearchDummy(['id1', 'id2'])
s.multi_search_list()
print('Multi list output', s.good_ids)  # returns ['id2']
print('Multi list remaining', s.search_queue_list)  # returns ['id101', 'id102']

# threaded queue (ThreadPoolExecutor)
s = SearchDummy(['id1', 'id2'])
s.multi_search_queue()
print('Multi queue output', s.good_ids)  # returns ['id2']
print('Multi queue remaining', list(s.search_queue_queue.queue))  # returns ['id101', 'id102']
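What happens in the parallel versions is that the submit loop exits too early: the two starting IDs are popped (or got) almost immediately, the while condition then sees an empty list/queue, and the with block shuts the executor down while the first tasks are still sleeping, so the children they add afterwards are never submitted (any extra tasks submitted in the initial burst fail silently with IndexError, since futures swallow exceptions unless you check .result()). One way to keep consuming work that grows while it is being processed is a fixed pool of worker threads looping on the Queue, combined with Queue.join(), which only returns once every put() item has been matched by a task_done(). A minimal sketch against the mock class above (the worker count is arbitrary):

import threading

def threaded_search(search, workers=4):
    # drain search.search_queue_queue with a fixed pool of worker threads
    q = search.search_queue_queue
    lock = threading.Lock()

    def worker():
        while True:
            current_id = q.get()
            try:
                if search.get_total(current_id) <= 1000:
                    with lock:
                        search.good_ids.append(current_id)
                else:
                    # children are put() before task_done() runs, so the
                    # unfinished-task count can never hit zero too early
                    for child in search.get_children(current_id):
                        q.put(child)
            finally:
                q.task_done()

    for _ in range(workers):
        threading.Thread(target=worker, daemon=True).start()
    q.join()  # returns only once every queued item has been processed

s = SearchDummy(['id1', 'id2'])
threaded_search(s)
print(s.good_ids)  # ['id2', 'id101', 'id102'] in some order

The same bookkeeping carries over to real multiprocessing if the queue is a multiprocessing.JoinableQueue and good_ids is replaced by something process-safe, e.g. a multiprocessing.Manager().list().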

Run a function in background using thread in Flask

I am trying to implement a limit order book using Flask, and I am currently working on the backend. I am new to Flask and not very familiar with how trading backends work, but I am trying to learn through this small project.
I have created three endpoints in my application, which add an order, remove an order, and return the order status; all three work fine when checked with Postman. Now I am trying to run a function in the background that continuously checks for new buy/sell orders in a JSON file where all new orders are saved. It will pick them up one by one and look for a match based on price; if one user's buy order matches a different user's sell order, it will process the trade and store it in a dict, and I want to return all those successful orders to the user.
Here is my code for the class I have created:
import json
import bisect
import random
import os

class Process(object):
    def __init__(self):
        self.trade_book = []
        self.bid_prices = []
        self.ask_prices = []
        self.ask_book = {}
        self.bid_book = {}
        self.confirm_traded = []
        self.orders_history = {}
        self.traded = False
        self.counter = 0

    def save_userdata(self, order, newId):
        orderid = order['order']['trader'] + "_" + str(newId)
        user_list = order
        newJson = {
            "orders": [
                {orderid: order['order']}
            ]
        }
        with open('data_user.json', 'a+') as jsonFile:
            with open('data_user.json', 'r') as readableJson:
                try:
                    jsonObj = json.load(readableJson)
                except Exception as e:
                    jsonObj = {}
            if jsonObj == {}:
                json.dump(newJson, jsonFile)
            else:
                with open('data_user.json', 'w+') as writeFile:
                    exists = False
                    for item in jsonObj['orders']:
                        if item.get(orderid, None) is not None:
                            item[orderid] = order['order']
                            exists = True
                            break
                    if not exists:
                        jsonObj['orders'].append(newJson['orders'][0])
                    json.dump(jsonObj, writeFile)
        return orderid

    def get_userdata(self):
        with open('data_user.json', 'r') as readableJson:
            return json.load(readableJson)

    def removeOrder(self, orderid):
        order_id = list(orderid.values())[0]
        with open('data_user.json') as data_file:
            data = json.load(data_file)
        newData = []
        for item in data['orders']:
            if item.get(order_id, None) is not None:
                del item[order_id]
            else:
                newData.append(item)
        data['orders'] = newData
        with open('data_user.json', 'w') as data_file:
            data = json.dump(data, data_file)
        return order_id

    def add_order_to_book(self, order):
        index = list(order.keys())[0]
        book_order = order[index]
        print(index)
        if order[index]['side'] == 'buy':
            book_prices = self.bid_prices
            book = self.bid_book
        else:  # order[index]['side'] == 'sell'
            book_prices = self.ask_prices
            book = self.ask_book
        if order[index]['price'] in book_prices:
            book[order[index]['price']]['num_orders'] += 1
            book[order[index]['price']]['size'] += order[index]['quantity']
            book[order[index]['price']]['order_ids'].append(index)
            book[order[index]['price']]['orders'][index] = book_order
        else:
            bisect.insort(book_prices, order[index]['price'])
            book[order[index]['price']] = {'num_orders': 1,
                                           'size': order[index]['quantity'],
                                           'order_ids': [index],
                                           'orders': {index: book_order}}

    def confirm_trade(self, order_id, timestamp, order_quantity, order_price, order_side):
        trader = order_id.partition('_')[0]
        self.confirm_traded.append({'trader': trader,
                                    'quantity': order_quantity,
                                    'side': order_side,
                                    'price': order_price,
                                    'status': 'Successful'})
        return self.confirm_traded

    def process_trade_orders(self, order):
        self.traded = False
        index = list(order.keys())[0]
        if order[index]['side'] == 'buy':
            book = self.ask_book
            if order[index]['price'] in self.ask_prices:
                remainder = order[index]['quantity']
                while remainder > 0:
                    book_order_id = book[order[index]['price']]['order_ids'][0]
                    book_order = book[order[index]['price']]['orders'][book_order_id]
                    if remainder >= book_order['quantity']:
                        self.trade_book.append({'order_id': book_order_id,
                                                'timestamp': order[index]['timestamp'],
                                                'price': order[index]['price'],
                                                'quantity': order[index]['quantity'],
                                                'side': book_order['side']})
                        self.confirm_trade(index, order[index]['timestamp'], order[index]['quantity'],
                                           order[index]['price'], order[index]['side'])
                        self.traded = True
                        remainder = remainder - book_order['quantity']
                        self.save_historty_orders(index, order[index])
                        break
                    else:
                        self.traded = True
                        self.trade_book.append({'order_id': index,
                                                'timestamp': order[index]['timestamp'],
                                                'price': order[index]['price'],
                                                'quantity': order[index]['quantity'],
                                                'side': order[index]['side']})
                        self.confirm_trade(index, order[index]['timestamp'], order[index]['quantity'],
                                           order[index]['price'], order[index]['side'])
                        self.save_historty_orders(index, order[index])
                        break
            else:
                self.add_order_to_book(order)
                self.save_historty_orders(index, order[index])
        else:  # order[index]['side'] == 'sell'
            book = self.bid_book
            if order[index]['price'] in self.bid_prices:
                remainder = order[index]['quantity']
                while remainder > 0:
                    book_order_id = book[order[index]['price']]['order_ids'][0]
                    book_order = book[order[index]['price']]['orders'][book_order_id]
                    if remainder >= book_order['quantity']:
                        self.trade_book.append({'order_id': book_order_id,
                                                'timestamp': order[index]['timestamp'],
                                                'price': order[index]['price'],
                                                'quantity': order[index]['quantity'],
                                                'side': order[index]['side']})
                        self.traded = True
                        self.confirm_trade(index, order[index]['timestamp'], order[index]['quantity'],
                                           order[index]['price'], order[index]['side'])
                        remainder = remainder - book_order['quantity']
                        self.save_historty_orders(index, order[index])
                        break
                    else:
                        self.traded = True
                        self.trade_book.append({'order_id': book_order_id,
                                                'timestamp': order[index]['timestamp'],
                                                'price': order[index]['price'],
                                                'quantity': order[index]['quantity'],
                                                'side': order[index]['side']})
                        self.confirm_trade(index, order[index]['timestamp'], order[index]['quantity'],
                                           order[index]['price'], order[index]['side'])
                        self.save_historty_orders(index, order[index])
                        break
            else:
                self.add_order_to_book(order)
                self.save_historty_orders(index, order[index])
I create a Process object in my app.py and call process_trade_orders on it inside a function processing():
app = Flask(__name__)
app.config['DEBUG'] = True

newUser = Process()
succorder = Success()
# sched = BackgroundScheduler()

def generate_orderid():
    num = 0
    while num < 1000:
        yield num
        num = num + 1

genid = generate_orderid()
proc = Process()
sucorder = Success()

# Processing orders to find if they have a match
def processing():
    get_orders_data = proc.get_userdata()
    print(get_orders_data)
    print("\n")
    for data in get_orders_data['orders']:
        index = list(data.keys())[0]
        if data[index]['status'] == 'Successful':
            sucorder.add_trader_orders(data[index], index)
        else:
            proc.process_trade_orders(data)

# sched = BackgroundScheduler()
# sched.add_job(func=processing, trigger="interval", seconds=2)
# sched.start()
I did use APScheduler's BackgroundScheduler for this, but I want to use a thread instead. I was thinking of running a worker thread as a daemon in an infinite loop that calls this processing() function in app.py every few seconds; whenever there are successful orders it would hand the list of dicts back to the main thread, so that I can return a response (or use some other mechanism) to tell the user about the matched orders.
Note that this will run at short intervals, around 5 seconds, with multiple orders being added and the checks running continuously and asynchronously, so I am not sure how I would return those values. I am just confused, so I would be grateful for any help.
If you want to make a threaded function that runs in the background, just use the threading module, like this:
from threading import Thread

def bg_func():
    ...  # do something

t = Thread(target=bg_func)
t.start()  # starts the function and returns immediately, even while it still runs

...  # do something else at the same time, while bg_func runs
You can also have multiple background threads.
Check the documentation for more info.
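Applied to this app, the pattern could look like the sketch below: a daemon thread calls processing() every few seconds and pushes whatever proc.confirm_traded has collected onto a thread-safe queue that an endpoint can drain. The poll_orders name, the /matches route, and the use of confirm_traded as the result list are assumptions based on the question's code, not a definitive design:
import queue
import threading
import time

matches = queue.Queue()  # thread-safe hand-off between worker and Flask

def poll_orders(interval=5):
    while True:
        try:
            processing()  # the function from app.py above
            # hand any confirmed trades to the main app (assumed result list)
            while proc.confirm_traded:
                matches.put(proc.confirm_traded.pop(0))
        except Exception as e:
            print('processing failed:', e)
        time.sleep(interval)

# daemon=True so the thread dies together with the main process
threading.Thread(target=poll_orders, daemon=True).start()

@app.route('/matches')
def get_matches():
    # drain everything matched since the last call
    out = []
    while not matches.empty():
        out.append(matches.get())
    return {'matches': out}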

How to read the next page on API using python iterator?

There is an API that only returns one hundred results per page. I am trying to write a while loop that goes through all the pages and collects the results from each one, but it does not work. I would be grateful if you could help me figure it out.
import requests

params = dict(
    order_by='salary_desc',
    text=keyword,
    area=area,
    period=30,  # days
    per_page=100,
    page=0,
    no_magic='false',  # disable magic
    search_field='name'  # available: name, description, company_name
)

response = requests.get(
    BASE_URL + '/vacancies',
    headers={'User-Agent': generate_user_agent()},
    params=params,
)

items = response.json()['items']

vacancies = []
for item in items:
    vacancies.append(dict(
        id=item['id'],
        name=item['name'],
        salary_from=item['salary']['from'] if item['salary'] else None,
        salary_to=item['salary']['to'] if item['salary'] else None,
        currency=item['salary']['currency'] if item['salary'] else None,
        created=item['published_at'],
        company=item['employer']['name'],
        area=item['area']['name'],
        url=item['alternate_url']
    ))
I loop while there is a result: if the dictionary has results, I add 1 to the page parameter:
while vacancies == True:
    params['page'] += 1
But params['page'] stays at zero (pages in this API start at zero). Inspecting params after running the loop gives:
{'area': 1,
 'no_magic': 'false',
 'order_by': 'salary_desc',
 'page': 0,
 'per_page': 100,
 'period': 30,
 'search_field': 'name',
 'text': '"python"'}
Perhaps I am writing the loop incorrectly; my logic was that the loop should keep running while there is a result in the dictionary.
while vacancies == True:
    params['page'] += 1
will never run, because vacancies never equals the literal True, regardless of its contents. Python containers (your vacancies is a list, not a dict) are truthy when non-empty, but they are never equal to True. You need to loosen the comparison:
if vacancies:  # truthy if len > 0, falsey otherwise
    # Do something
Or you can explicitly check that it has content
if len(vacancies) > 0:
    # Do something
This solves the problem of how to evaluate based on an object but doesn't solve the overall logic problem.
for _ in vacancies:
    params["page"] += 1
    # Does something for every item in vacancies
What you do each loop will depend on the problem and will require another question!
Fixed below:
params = dict(
    order_by='salary_desc',
    text=keyword,
    area=area,
    period=30,  # days
    per_page=100,
    page=0,
    no_magic='false',  # disable magic
    search_field='name'  # available: name, description, company_name
)

pages = []
while True:
    response = requests.get(BASE_URL + '/vacancies',
                            headers={'User-Agent': generate_user_agent()},
                            params=params)
    items = response.json()['items']
    if not items:
        break
    pages.append(items)   # collect each page
    params["page"] += 1   # increment after the request, so page 0 is not skipped
Then build vacancies for each page:
results = []
for page in pages:
    vacancies = []
    for item in page:
        vacancies.append(dict(
            id=item['id'],
            name=item['name'],
            salary_from=item['salary']['from'] if item['salary'] else None,
            salary_to=item['salary']['to'] if item['salary'] else None,
            currency=item['salary']['currency'] if item['salary'] else None,
            created=item['published_at'],
            company=item['employer']['name'],
            area=item['area']['name'],
            url=item['alternate_url']
        ))
    results.append(vacancies)
results will then hold the vacancies from every page.
vacancies is never equal to True.
If you want to test the boolean value of vacancies you could use bool(vacancies). But with Python, you can simply use
while vacancies:
    # some code logic
This way, Python automatically converts your list to a bool: if the list has something inside (len(your_list) > 0), bool(your_list) evaluates to True; otherwise it is False.
Also, instead of using dict(), you could write your dict this way:
params = {
    'order_by': 'salary_desc',
    'text': keyword,
    'area': area,
    'period': 30,  # days
    'per_page': 100,
    'page': 0,
    'no_magic': 'false',  # disable magic
    'search_field': 'name'  # available: name, description, company_name
}
which is more Pythonic.
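Since the title asks about an iterator: a generator keeps the paging state internal and lets callers consume results lazily, one page at a time. A sketch reusing BASE_URL, generate_user_agent, and params from the question (all assumed to be defined as above):
import requests

def iter_vacancies(base_params):
    # yield raw vacancy items page by page until an empty page comes back
    params = dict(base_params, page=0)  # copy, so the caller's dict is untouched
    while True:
        response = requests.get(
            BASE_URL + '/vacancies',
            headers={'User-Agent': generate_user_agent()},
            params=params,
        )
        items = response.json()['items']
        if not items:
            return
        yield from items
        params['page'] += 1

# usage: one flat list of raw items across all pages
all_items = list(iter_vacancies(params))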

How do you model multiple arrival distributions?

Python: I am simulating a call centre with two types of incoming calls, sales calls and service calls. These calls have different, independent interarrival distributions, and they enter the same system.
I have a function, arrivals, which contains:
iat_sales = random.expovariate(1/3)
yield env.timeout(iat_sales)
I want to incorporate:
iat_service = random.triangular(0,0,6)
yield env.timeout(iat_service)
How can I yield both kinds of events simultaneously?
This is the solution I have come up with:
def arrival_list():
    sales_time = 0     # sim time of sales arrival
    service_time = 0   # sim time of service arrival
    sales_list = []    # list of sequential sales arrivals [arrival time, 'sales']
    service_list = []  # list of sequential service arrivals [arrival time, 'service']
    arrivals = []      # ordered list of merged arrivals [arrival time, arrival type, iat]
    while sales_time < sim_end:
        iat_sales = random.expovariate(sales_rate)
        sales_time += iat_sales
        sales = [sales_time, 'sales']
        sales_list.append(sales)
    while service_time < sim_end:
        iat_service = random.triangular(0, 6, 0)  ####
        service_time += iat_service
        service = [service_time, 'service']
        service_list.append(service)
    arrivals = sales_list + service_list
    arrivals.sort()
    arrivals[0].append(arrivals[0][0])
    for i in range(len(arrivals) - 1):
        arrivals[i + 1].append(arrivals[i + 1][0] - arrivals[i][0])
    return arrivals
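For what it is worth, SimPy can also run the two arrival streams as independent concurrent processes in one environment, so nothing has to be precomputed. A minimal sketch (the function names and the 20-unit horizon are illustrative; note that random.triangular takes (low, high, mode)):
import random
import simpy

def sales_arrivals(env):
    while True:
        yield env.timeout(random.expovariate(1/3))     # exponential interarrival
        print(f'{env.now:.2f}: sales call')

def service_arrivals(env):
    while True:
        yield env.timeout(random.triangular(0, 6, 0))  # (low, high, mode)
        print(f'{env.now:.2f}: service call')

env = simpy.Environment()
env.process(sales_arrivals(env))
env.process(service_arrivals(env))
env.run(until=20)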
As a reference, a simple implementation can be done as below, where the simulation runs indefinitely in 1-second steps and a call is considered to arrive when its random draw exceeds a threshold:
import random
import time

def generate_calls():
    return random.expovariate(1/3), random.triangular(10, 20, 5)

def simulation(count, sales_acceptance, services_acceptance):
    # run the simulation indefinitely
    while True:
        print('Time: {}'.format(count))
        sales_call, services_call = generate_calls()
        # calls arrive if the values exceed some thresholds
        if sales_call > sales_acceptance:
            print('Sales call arrived!')
        if services_call > services_acceptance:
            print('Services call arrived!')
        time.sleep(1)
        count += 1

simulation(1, 2, 13)
You can have three separate parallel processes:
1. One process for making sales calls.
2. One process for making service calls.
3. One process for handling calls.
import simpy
import random

sim_end = 1000

def generateSalesCall(env, call_pipe):
    while env.now < sim_end:
        # put call in the pipe
        yield call_pipe.put("sales")
        interval = random.expovariate(1/3)
        yield env.timeout(interval)

def generateServiceCall(env, call_pipe):
    while env.now < sim_end:
        # put call in the pipe
        yield call_pipe.put("service")
        interval = random.triangular(0, 6, 0)
        yield env.timeout(interval)

def handleCalls(env, call_pipe):
    while True:
        call = yield call_pipe.get()
        if call == "sales":
            print(env.now, "sales call")
        elif call == "service":
            print(env.now, "service call")

env = simpy.Environment()
call_pipe = simpy.Store(env)
env.process(generateSalesCall(env, call_pipe))
env.process(generateServiceCall(env, call_pipe))
env.process(handleCalls(env, call_pipe))
env.run()

How to use python multiprocessing to get a global sum after traversing through http requests?

I'm trying to write an algorithm that traverses the entire collection of nodes and returns the sum of their rewards. Each reward should only be counted a single time. The input to the algorithm will be a URL for a node to begin with, such as http://fake.url/a.
Each GET request to a node's URL returns a JSON document like this:
{
    "children": [
        "http://fake.url/b",
        "http://fake.url/c"
    ],
    "reward": 1
}
Here's what I've tried:
import multiprocessing
import requests
import json

my_q = multiprocessing.Queue()
my_list = ['http://fake.url/']
reward_sum = 0

def enqueue(q):
    for data in my_list:
        q.put(data)

def get_it(q):
    while not q.empty():
        item = q.get()
        print(item)
        response = requests.get(item)
        kids = json.loads(response.content)
        print(f'URL: {item} --> {kids["reward"]}')
        for kid in kids['children']:
            print(kid)
            q.put(kid)

p1 = multiprocessing.Process(target=enqueue, args=(my_q,))
p2 = multiprocessing.Process(target=get_it, args=(my_q,))
p1.start()
p2.start()
p1.join()
p2.join()
What works above:
I am using multiprocessing.
I am accessing the children and rewards correctly.
I am getting output like this:
http://fake.url/a
URL: http://fake.url/a --> 1
{'children': ['http://fake.url/b', 'http://fake.url/c'], 'reward': 1}
http://fake.url/b
http://fake.url/c
http://fake.url/b
URL: http://fake.url/b --> 2
{'children': ['http://fake.url/d', 'http://fake.url/e'], 'reward': 2}
http://fake.url/d
http://fake.url/e
http://fake.url/c
URL: http://fake.url/c --> 3
{'children': ['http://fake.url/f', 'http://fake.url/g'], 'reward': 3}
http://fake.url/f
http://fake.url/g
http://fake.url/d
URL: http://fake.url/d --> 4
{'reward': 4}
http://fake.url/e
URL: http://fake.url/e --> 5
{'reward': 5}
http://fake.url/f
URL: http://fake.url/f --> 6
{'children': ['http://fake.url/h'], 'reward': 6}
http://fake.url/h
http://fake.url/g
The problems I need help with:
1. How do I keep track of the total reward sum in a global variable?
2. How do I keep track of a global "seen" set, so I don't add duplicates to the total reward sum?
def get_it(q):
    rewards_total = 0
    seen = set()
    while not q.empty():
        item = q.get()
        print(item)
        if item in seen:
            continue
        seen.add(item)
        response = requests.get(item)
        kids = json.loads(response.content)
        rewards_total += kids["reward"]
        print(f'URL: {item} --> {kids["reward"]}')
        for kid in kids.get('children', []):  # leaf nodes have no 'children' key
            print(kid)
            q.put(kid)
    return rewards_total
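Note that get_it above keeps seen and rewards_total local to a single process, and the return value of a Process target is discarded, so it only works while one process does the whole traversal. If the work really is split across processes, the state has to live in shared memory. A sketch using multiprocessing.Manager and Value, reusing the question's fake URLs (the worker count is arbitrary, and the q.empty() check keeps the same early-exit race as the original):
import json
import multiprocessing
import requests

def get_it(q, seen, total, lock):
    while not q.empty():
        item = q.get()
        with lock:
            if item in seen:
                continue
            seen[item] = True  # Manager dict standing in for a shared set
        kids = json.loads(requests.get(item).content)
        with lock:
            total.value += kids['reward']
        for kid in kids.get('children', []):
            q.put(kid)

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    seen = manager.dict()                  # shared "seen" set
    total = multiprocessing.Value('i', 0)  # shared reward sum
    lock = multiprocessing.Lock()
    q = multiprocessing.Queue()
    q.put('http://fake.url/a')
    workers = [multiprocessing.Process(target=get_it, args=(q, seen, total, lock))
               for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print('total reward:', total.value)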

How to get the latest offset from each partition using kafka-python?

I'm trying to get the latest offset (not committed offset) from each partition for a given topic.
from kafka import KafkaConsumer, TopicPartition

topic = 'test-topic'
broker = 'localhost:9092'

consumer = KafkaConsumer(bootstrap_servers=broker)
tp = TopicPartition(topic, 0)        #1
consumer.assign([tp])                #2
consumer.seek_to_end(tp)             #3
last_offset = consumer.position(tp)  #4

for i in consumer.partitions_for_topic(topic):
    tp = TopicPartition(topic, i)
    consumer.assign([tp])
    consumer.seek_to_end(tp)
    last_offset = consumer.position(tp)
    print(last_offset)
The preceding code works and prints the offset of each partition. However, notice that I have the same four lines outside the loop as well as inside it. If I remove any of the lines #1-#4 (the four lines directly preceding the for loop), I get the error:
File "check_kafka_offset.py", line 19, in
for i in consumer.partitions_for_topic(topic):
TypeError: 'NoneType' object is not iterable
Why do I need to have the 4 lines before the for loop?
You can use the end_offsets(partitions) function of the consumer to get the last offset for the partitions specified. Note that the returned offset is the next offset, that is, the current end + 1. Documentation here. (As for the TypeError: partitions_for_topic likely returns None because it only reads the client's cached cluster metadata, which is empty until something, such as the assign/seek/position calls, forces a metadata fetch.)
Edit: Example implementation:
from kafka import KafkaProducer, KafkaConsumer, TopicPartition
from kafka.errors import KafkaError
import json
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

BOOTSTRAP = """cluster:9092"""
API_KEY = """redacted"""
API_SECRET = """redacted"""
TOPIC = "python-test"

consumer = KafkaConsumer(
    group_id="my-group",
    bootstrap_servers=[BOOTSTRAP],
    security_protocol="SASL_SSL",
    sasl_mechanism="PLAIN",
    sasl_plain_username=API_KEY,
    sasl_plain_password=API_SECRET,
    value_deserializer=lambda m: json.loads(m.decode('ascii')),
    auto_offset_reset='earliest'
)
PARTITIONS = []
for partition in consumer.partitions_for_topic(TOPIC):
    PARTITIONS.append(TopicPartition(TOPIC, partition))

end_offsets = consumer.end_offsets(PARTITIONS)
print(end_offsets)
and end_offsets looks like this:
{TopicPartition(topic=u'python-test', partition=0): 5,
 TopicPartition(topic=u'python-test', partition=1): 20,
 TopicPartition(topic=u'python-test', partition=2): 0}
Here is a simple and well-documented function:
from kafka import TopicPartition

def getTopicInfos(consumer, topic: str):
    """
    Get a topic's information, such as its partitions with their last offsets.
    Example result: {'topic': 'myTopic', 'partitions': ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']}
    - Parameters:
        consumer: a Kafka consumer.
        topic: a topic name.
    - Return:
        The topic's information.
    """
    # Get topic-partition pairs
    # E.g.: [TopicPartition(topic='myTopic', partition=0), TopicPartition(topic='myTopic', partition=1)]
    tp = [TopicPartition(topic, partition) for partition in consumer.partitions_for_topic(topic)]
    # Get the last offsets
    # E.g.: {TopicPartition(topic='myTopic', partition=0): 47, TopicPartition(topic='myTopic', partition=1): 98}
    tplo = consumer.end_offsets(tp)
    # Format partition-lastOffset pairs
    # E.g.: ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']
    plo = ['{' + f'"partition": {item.partition}, "lastOffset": {tplo.get(item)}' + '}' for item in tplo]
    # Combine the topic with its partition-lastOffset pairs
    # E.g.: {'topic': 'myTopic', 'partitions': ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']}
    tplo = {"topic": topic, "partitions": plo}
    # Return the result
    return tplo
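A hypothetical usage, assuming a broker on localhost and the test topic from the question:
from kafka import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers='localhost:9092')
print(getTopicInfos(consumer, 'test-topic'))
# e.g. {'topic': 'test-topic', 'partitions': ['{"partition": 0, "lastOffset": 5}']}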
