Why is multi-threading worse than single-threading? [duplicate] - python

This question already has answers here:
Python: Why is threaded function slower than non thread
(2 answers)
Closed 2 years ago.
I need to implement a search function over an array of a complex class, but when I switched to multiple threads, I found that it became slower than before!
So I tested some simple code and saw that it is true.
My code:
import numpy as np
import threading
import time
class Test(object):
    def __init__(self):
        self.num_workers = 1
        self.workers = []
        self.poses = []
        self.arr = None

    def search(self, arr, val):
        self.poses = []
        for i, a in enumerate(arr):
            if a == val:
                self.poses.append(i)
        return self.poses

    def search_worker(self, val, ID):
        search_len = int(len(self.arr) / self.num_workers)
        prefix = ID * search_len
        if ID == self.num_workers - 1:
            search_len = int(len(self.arr) - prefix)
        for i in range(search_len):
            if self.arr[prefix + i] == val:
                self.poses.append(i)

    def search_multi_thread(self, arr, val):
        self.arr = arr
        self.poses = []
        self.num_workers = 5
        for i in range(self.num_workers):
            worker = threading.Thread(target=self.search_worker, args=(val, i,))
            worker.start()
            self.workers.append(worker)
        for i in range(self.num_workers):
            self.workers[i].join()
        return self.poses

if __name__ == '__main__':
    t = Test()
    sample = np.random.randint(1000, size=50000000)

    t1 = time.perf_counter()
    res = t.search(sample, 65)
    t2 = time.perf_counter()
    print(F'Elapsed time to search = {t2 - t1}')

    t1 = time.perf_counter()
    res = t.search_multi_thread(sample, 65)
    t2 = time.perf_counter()
    print(F'Elapsed time to search with multiple thread = {t2 - t1}')
Result:
Elapsed time to search = 13.291269699999999
Elapsed time to search with multiple thread = 17.8231911
Environment:
OS = Windows 10
Python = 3.7.7
CPU = Intel Core i7 6700HQ
What am I doing wrong?
How can I solve this problem?
(I read about multiprocessing, but it seems that each process has its own memory space, so they can't all access a single shared array.)

Note that, while working with threads, one thing to keep in mind is that threads tend to increase the efficiency of your program when there are chances of the processor sitting idle (whatever the cause: I/O, sleeps, waiting on user interaction, etc.) during the processing of your work. If this is not the case, then the overhead of thread switching will simply degrade the performance of your program further.
In your case, there is very little chance of the processor sitting idle for any significant time. Moreover, you are using far too many threads for this task. So the thread switching simply outweighs whatever performance you gain from the use of threads, and the performance of your program degrades further.
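As a rough illustration (a minimal sketch of my own; exact timings will vary by machine), compare CPU-bound work, where CPython's global interpreter lock (GIL) keeps threads from running Python code in parallel, with I/O-bound work, where threads can overlap the waiting:

import threading
import time

def cpu_bound(n=10_000_000):
    # Pure Python arithmetic: the GIL is held, so threads cannot run this in parallel
    total = 0
    for i in range(n):
        total += i

def io_bound(delay=1.0):
    # Sleeping releases the GIL, so threads can overlap the waiting
    time.sleep(delay)

def run_in_threads(target, count=4):
    threads = [threading.Thread(target=target) for _ in range(count)]
    start = time.perf_counter()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return time.perf_counter() - start

print(f'4 CPU-bound tasks in threads: {run_in_threads(cpu_bound):.2f} s')  # roughly as slow as running them one after another
print(f'4 I/O-bound tasks in threads: {run_in_threads(io_bound):.2f} s')   # roughly 1 s, not 4 s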

Massive performance boost after using multiprocessing.
import numpy as np
import time
import multiprocessing
class Test(object):
    def __init__(self):
        self.num_workers = 1
        self.workers = []
        self.poses = []
        self.arr = None

    def search(self, arr, val):
        self.poses = []
        for i, a in enumerate(arr):
            if a == val:
                self.poses.append(i)
        return self.poses

    def search_worker(self, val, ID):
        search_len = int(len(self.arr) / self.num_workers)
        prefix = ID * search_len
        if ID == self.num_workers - 1:
            search_len = int(len(self.arr) - prefix)
        for i in range(search_len):
            if self.arr[prefix + i] == val:
                self.poses.append(i)

    def search_multi_thread(self, arr, val):
        self.arr = arr
        self.poses = []
        self.num_workers = 5
        for i in range(self.num_workers):
            worker = multiprocessing.Process(target=self.search_worker, args=(val, i,))
            worker.start()
            self.workers.append(worker)
        for i in range(self.num_workers):
            self.workers[i].join()
        return self.poses

if __name__ == '__main__':
    t = Test()
    sample = np.random.randint(1000, size=50000000)

    t1 = time.perf_counter()
    res = t.search(sample, 65)
    t2 = time.perf_counter()
    print(F'Elapsed time to search = {t2 - t1}')

    t1 = time.perf_counter()
    res = t.search_multi_thread(sample, 65)
    t2 = time.perf_counter()
    print(F'Elapsed time to search with multiprocessing = {t2 - t1}')
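One caveat worth keeping in mind with this approach: each worker process gets its own copy of the object, so appends to self.poses made in the workers are not visible in the parent process. A minimal sketch of one way to actually collect the matching indices, using a multiprocessing.Pool and returning each chunk's result (the function and variable names here are illustrative, not from the code above):

import numpy as np
from multiprocessing import Pool

def find_positions(args):
    # Search one chunk and return absolute indices of matches
    chunk, offset, val = args
    return [offset + i for i, a in enumerate(chunk) if a == val]

def parallel_search(arr, val, workers=5):
    chunk_size = len(arr) // workers
    tasks = []
    for w in range(workers):
        start = w * chunk_size
        end = len(arr) if w == workers - 1 else start + chunk_size
        tasks.append((arr[start:end], start, val))
    with Pool(workers) as pool:
        results = pool.map(find_positions, tasks)
    # Flatten the per-chunk lists into one list of positions
    return [pos for chunk_result in results for pos in chunk_result]

if __name__ == '__main__':
    sample = np.random.randint(1000, size=1_000_000)
    print(len(parallel_search(sample, 65)))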

Related

How to test a set of conditions on a large number of variants in Python

In Python, I have a lot of possible permutations, of which only one is valid (True).
The permutations are generated by code similar to the code below.
My question: can this be done faster? How?
from itertools import permutations, product
from datetime import datetime
def variant_generator(act, prop):
    for this_permutation in product(permutations(range(act)), repeat=prop):
        yield this_permutation

def is_valid_permutation(this_variant):
    if this_variant[1][3] == this_variant[2][2]:
        if this_variant[0][0] == this_variant[1].index(3):
            if this_variant[1][4] == 0:
                if this_variant[2][0] == 4:
                    if this_variant[0][4] == this_variant[2][4]:
                        if this_variant[0][1] == 1:
                            # ... condition(n-1) ...
                            # ... condition(n) ... enough to result in ONE variant
                            return True

timerstart = datetime.now()
variants = variant_generator(6, 3)
for this_variant in variants:
    if is_valid_permutation(this_variant):
        print('valid: ', this_variant)
timerend = datetime.now()
print('it took: {} seconds'.format((timerend-timerstart).seconds))
This is not an answer about speeding things up.
I just wanted to point out that the code below is equivalent and reduces the depth of indentation. If the nesting above was intentional, then I can delete my answer.
from itertools import permutations, product
from datetime import datetime
def variant_generator(act, prop):
    for this_permutation in product(permutations(range(act)), repeat=prop):
        yield this_permutation

def is_valid_permutation(this_variant):
    if this_variant[1][3] != this_variant[2][2]:
        return False
    if this_variant[0][0] != this_variant[1].index(3):
        return False
    if this_variant[1][4] != 0:
        return False
    if this_variant[2][0] != 4:
        return False
    if this_variant[0][4] != this_variant[2][4]:
        return False
    if this_variant[0][1] != 1:
        return False
    # ... condition(n-1) ...
    # ... condition(n) ... enough to result in ONE variant
    return True

timerstart = datetime.now()
variants = variant_generator(6, 3)
for this_variant in variants:
    if is_valid_permutation(this_variant):
        print('valid: ', this_variant)
timerend = datetime.now()
print('it took: {} seconds'.format((timerend-timerstart).seconds))
Here's a solution that utilises multiprocessing. On my system this runs significantly faster than the code in question, though results will vary by platform. The value of BLOCK also affects performance differently per platform; the value shown here appears to be optimal on my machine. My total execution time is ~120 s.
from itertools import permutations, product
from datetime import datetime
import multiprocessing as mp
def variant_generator(act, prop):
    for this_permutation in product(permutations(range(act)), repeat=prop):
        yield this_permutation

def ivp(v):
    vr = []
    for this_variant in v:
        if this_variant[1][3] == this_variant[2][2]:
            if this_variant[0][0] == this_variant[1].index(3):
                if this_variant[1][4] == 0:
                    if this_variant[2][0] == 4:
                        if this_variant[0][4] == this_variant[2][4]:
                            if this_variant[0][1] == 1:
                                # ... condition(n-1) ...
                                # ... condition(n) ... enough to result in ONE variant
                                vr.append(f'{str(this_variant)} is valid')
    return vr

def getresults(results):
    for r in results:
        for _r in r.get():
            print(_r)

def main():
    timerstart = datetime.now()
    with mp.Pool() as pool:
        variants = variant_generator(6, 3)
        BLOCK = 100_000
        V = [None] * BLOCK
        results = []
        i = 0
        for this_variant in variants:
            V[i] = this_variant
            if (i := i + 1) % BLOCK == 0:
                results.append(pool.apply_async(ivp, [V]))
                i = 0
        results.append(pool.apply_async(ivp, [V[:i]]))
        getresults(results)
    timerend = datetime.now()
    print(f'Duration: {timerend - timerstart}')

if __name__ == '__main__':
    main()
Found a solution with 'yield': each subroutine checks one condition for a candidate and, if it holds, passes that candidate on to the next condition, one by one. This saves memory and time, because at any given moment only one candidate solution is being considered.
In pseudocode:
subroutine_to_check_for_clue_1(candidates):
    for each candidate in candidates:
        if the clue applies to this candidate:
            yield this candidate

subroutine_to_check_for_clue_n-1(candidates):
    for each candidate in candidates:
        if the clue applies to this candidate:
            yield this candidate

subroutine_to_check_for_clue_n(candidates):
    for each candidate in candidates:
        if the clue applies to this candidate:
            yield this candidate

main routine:
    candidates = candidate_generator()
    solutions = subroutine_n(subroutine_n-1(... subroutine_1(candidates) ...))
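A minimal runnable sketch of the same idea (using only the first few clues from the question; the clue_* names below are just for illustration):

from itertools import permutations, product

def candidates(act, prop):
    return product(permutations(range(act)), repeat=prop)

# Each filter checks one clue and lazily passes surviving candidates along.
def clue_1(cands):
    return (v for v in cands if v[1][3] == v[2][2])

def clue_2(cands):
    return (v for v in cands if v[0][0] == v[1].index(3))

def clue_3(cands):
    return (v for v in cands if v[1][4] == 0 and v[2][0] == 4)

if __name__ == '__main__':
    solutions = clue_3(clue_2(clue_1(candidates(6, 3))))
    for v in solutions:
        print('valid:', v)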

Issue with trying to dynamically resize a python multiprocessing array

Edit: Added Minimal Reproducible Problem Example
I'm trying to create a wrapper class that emulates a thread-safe ArrayList. All methods seem to work fine, but the resizeBackingArray method isn't working. I gather this is because I'm reassigning a value that belongs to the object, not to the class, so the resized backing array isn't visible to all the threads.
What would be the best way to fix this? I know that multiprocessing.Manager provides a list, but I was hoping to use the faster ctypes array.
You'll notice in the output that the array has to be resized, since 15 elements are added to an array with an initial capacity of 10. However, after resizing, the copy of the list available to main.py has a length of 10, while the size has been incremented to 15.
Output:
Resizing
Size of Array: 15
Length of Array: 10
Done!
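Here is a smaller, self-contained sketch of the rebinding behaviour I am describing (the Holder class below is just for illustration); the full example follows after it:

import multiprocessing

class Holder:
    def __init__(self):
        self.arr = multiprocessing.Array('i', 10)

    def grow(self):
        # Rebinds self.arr in *this* process only; the parent keeps the old array
        self.arr = multiprocessing.Array('i', 20)

def worker(holder):
    holder.grow()
    print("child sees length:", len(holder.arr))    # 20

if __name__ == '__main__':
    h = Holder()
    p = multiprocessing.Process(target=worker, args=(h,))
    p.start()
    p.join()
    print("parent sees length:", len(h.arr))        # still 10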
From main.py
def addToList(aList):
    for x in range(0, 15):
        aList.add(x)

if __name__ == "__main__":
    import sys
    safeArray = ThreadSafeArray.SafeArrayList()
    p3 = multiprocessing.Process(target=addToList, args=(safeArray,))
    p3.start()
    p3.join()
    print("Size of Array: " + str(safeArray.getSize()))
    print("Length of Array: " + str(safeArray.lengthOfBackingArray()))
    print("Done!")
From ThreadSafeArray.py
import multiprocessing
import sys, traceback
class SafeArrayList:
    def __init__(self, initSize = 10):
        self.arr = multiprocessing.Array('i', initSize)
        self.size = multiprocessing.Value('i', 0)
        self.lock = multiprocessing.RLock()

    # Doubles the size of the array
    def resizeBackingArray(self):
        #print("resizing")
        with self.lock:
            newArr = multiprocessing.Array('i', self.size.value * 2)
            for num in range(0, self.size.value):
                newArr[num] = self.arr[num]
            self.arr = newArr

    def add(self, num):
        with self.lock:
            try:
                if self.size.value == len(self.arr):
                    print("Resizing")
                    self.resizeBackingArray()
                    if self.size.value == len(self.arr):
                        print("ERROR")
                self.arr[self.size.value] = num
                self.size.value = self.size.value + 1
            except:
                print('-' * 60)
                print("Error")
                print(sys.exc_info())
                traceback.print_exc(file=sys.stdout)
                print('-' * 60)
                print("Printing " + str(num))
                print("Size = " + str(self.size.value) + "\nLength = " + str(len(self.arr)))
                self.printArray()

    def get(self, index):
        with self.lock:
            if index < 0 or index >= self.size.value:
                return None
            else:
                return self.arr[index]

    def getSize(self):
        return self.size.value

    def lengthOfBackingArray(self):
        return len(self.arr)

    def printArray(self):
        print("Printing Array")
        for x in range(0, self.size.value):
            print(str(self.arr[x]))

Passing a variable from another function in a class to Pool

The code below simulates a stock price and calculates its payoff. I am trying to use multiprocessing to speed up the simulations. The problem is in CallUpAndOut, where I have pool.map: I am not sure how to access total from Simulations.
I have tried several things, like self.Simulations.Total and self.total, but it doesn't work.
import numpy as np
from multiprocessing import Pool
import time
class PricingSimulatedBarrierOption:
    def __init__(self, spot, strike, barrier, rate, sigma, time, sims, steps):
        self.spot = spot
        self.strike = strike
        self.barrier = barrier
        self.rate = rate
        self.sigma = sigma
        self.time = time
        self.sims = sims
        self.steps = steps
        self.dt = self.time / self.steps

    def Simulations(self):
        total = np.zeros((self.sims, self.steps+1), float)
        pathwiseS = np.zeros((self.steps+1), float)
        for j in range(self.sims):
            pathwiseS[0] = self.spot
            total[j, 0] = self.spot
            for i in range(1, self.steps+1):
                phi = np.random.normal()
                pathwiseS[i] = pathwiseS[i-1]*(1+self.rate*self.dt+self.sigma*phi*np.sqrt(self.dt))
                total[j, i] = pathwiseS[i]
        return total.reshape(self.sims, self.steps+1)

    def CallUpAndOut(self):
        start_time = time.time()
        p = Pool()
        getpayoff = p.map(self.Simulations(), self.total)  ### How to pass total here?
        p.close()
        p.join()
        end_time = time.time() - start_time
        print(end_time)
        # getpayoff = self.Simulations()
        callpayoff = np.zeros((self.sims), float)
        for j in range(self.sims):
            if max(getpayoff[j,]) >= self.barrier:
                callpayoff[j] = 0
            else:
                callpayoff[j] = max(getpayoff[j, self.steps-1]-self.strike, 0)
        return np.exp(-self.rate*self.time)*np.average(callpayoff)

c = PricingSimulatedBarrierOption(100,100,170,0.05,0.2,1,10000,252)
print(c.CallUpAndOut())
In the function definition, add a parameter, as in the example below:
def CallUpAndOut(self, total):
and pass the array of total values to map, as in the example below:
total = [1,2,3]
getpayoff = p.map(self.Simulations, total)
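Putting those two pieces together, a rough sketch might look like this (the Pricer class and the fake_total array below are simplified, illustrative stand-ins for the question's class and its Simulations() output, not the original code):

import numpy as np
from multiprocessing import Pool

class Pricer:
    def __init__(self, strike, barrier, rate, time):
        self.strike, self.barrier = strike, barrier
        self.rate, self.time = rate, time

    def payoff(self, path):
        # Up-and-out call payoff for one simulated path
        if max(path) >= self.barrier:
            return 0.0
        return max(path[-1] - self.strike, 0.0)

    def price(self, total):
        # 'total' is the (sims, steps+1) array produced by the simulation step
        with Pool() as pool:
            payoffs = pool.map(self.payoff, total)
        return np.exp(-self.rate * self.time) * np.average(payoffs)

if __name__ == '__main__':
    p = Pricer(strike=100, barrier=170, rate=0.05, time=1)
    # Stand-in for simulated paths, centred roughly around 100
    fake_total = np.random.lognormal(mean=4.6, sigma=0.2, size=(1000, 253))
    print(p.price(fake_total))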
To make this work I had to move the declaration outside. The code below is now able to accept the variable in the Pool function.
import numpy as np
from multiprocessing import Pool
import time
class PricingSimulatedBarrierOption:
    def __init__(self, spot, strike, barrier, rate, sigma, time, sims, steps):
        self.spot = spot
        self.strike = strike
        self.barrier = barrier
        self.rate = rate
        self.sigma = sigma
        self.time = time
        self.sims = sims
        self.steps = steps
        self.dt = self.time / self.steps
        self.pathwiseS = np.zeros((self.steps+1), float)

    def Simulations(self):
        print("Called")
        total = np.zeros((self.sims, self.steps+1), float)
        self.pathwiseS = np.zeros((self.steps+1), float)
        for j in range(self.sims):
            self.pathwiseS[0] = self.spot
            total[j, 0] = self.spot
            for i in range(1, self.steps+1):
                phi = np.random.normal()
                self.pathwiseS[i] = self.pathwiseS[i-1]*(1+self.rate*self.dt+self.sigma*phi*np.sqrt(self.dt))
                total[j, i] = self.pathwiseS[i]
        return total.reshape(self.sims, self.steps+1)

    def CallUpAndOut(self):
        start_time = time.time()
        p = Pool()
        getpayoff = p.map(self.Simulations(), self.pathwiseS)
        p.close()
        p.join()
        end_time = time.time() - start_time
        print(end_time)
        # getpayoff = self.Simulations()
        callpayoff = np.zeros((self.sims), float)
        for j in range(self.sims):
            if max(getpayoff[j,]) >= self.barrier:
                callpayoff[j] = 0
            else:
                callpayoff[j] = max(getpayoff[j, self.steps-1]-self.strike, 0)
        return np.exp(-self.rate*self.time)*np.average(callpayoff)

Why does this loop take 5 seconds to execute in the django server but only 0.8 in the shell?

I'm trying to figure out a timing discrepancy in my Python Django project.
When I run the function in the shell, it only takes about a second to build the return object and return it.
When I run it via an API call, the timing says it takes over 5 seconds to loop through and serialize the objects.
Below is the function in question:
def serialize_particular(id=-1, all_available=False, page=0, maxOn=40):
    if id != -1:
        b = MyModel.objects.filter(id=id)
        if len(b) == 0:
            return []
        else:
            b = b[0]
    data = []
    if all_available:
        oThings = b.things
    else:
        oThings = b.things_available
    count = len(oThings)
    start = maxOn * page
    end = start + maxOn
    if start > count:
        oThings = oThings.filter(id=-1)
        nextPage = -1
    else:
        if end > count:
            end = count
            page = -2
        oThings = oThings[start:end]
        nextPage = page + 1
    for inv in oThings:
        data.append({})
        data[-1]['id'] = inv.id
        data[-1]['available'] = inv.available
        data[-1]['integer'] = inv.integer
        data[-1]['subObject'] = {}
        data[-1]['subObject']['name'] = inv.subObject.name
        data[-1]['subObject']['category'] = inv.subObject.category
        data[-1]['subObject']['subcategory'] = inv.subObject.subcategory
        data[-1]['subObject']['listOfSubs'] = []
        for subsub in inv.subObject.listOfSubs.all():
            data[-1]['subObject']['listOfSubs'].append({})
            data[-1]['subObject']['listOfSubs'][-1]['name'] = subsub.name
            data[-1]['subObject']['listOfSubs'][-1]['category'] = subsub.category
            data[-1]['subObject']['listOfSubs'][-1]['subcategory'] = subsub.subcategory
    return data, nextPage
Below is my timing function that I use in the server:
class Timing:
    active = True

    def __init__(self, text):
        if self.active:
            self.text = text
            self.start = time.time()

    def end(self):
        if self.active:
            self.end = time.time()
            self.total = self.end - self.start
            print("[%r] %2.6f s" % (self.text, self.total))
Below is the relevant code I use in the server API call:
T = Timing("API.things: Getting MyModel things")
s = serialize_particular(1, False, 0, 50000)
data = s[0]
nextPage = s[1]
T.end()
Below is the output of the timing in the server's log:
['API.things: Getting MyModel things'] 5.652001 s
Below is the output of my shell:
>>> def t(id):
...     T = Timing("API.things: Getting MyModel things")
...     z = api.serialize_business_inventory(id, False, 0, 50000)
...     T.end()
...     return z
>>> s = t(1)
['API.things: Getting MyModel things'] 0.469001 s
Does anyone have any idea why this is happening?

Wrapping code in main function makes it run much faster [duplicate]

This question already has answers here:
Why does Python code run faster in a function?
(3 answers)
Closed 7 years ago.
Trying to understand this phenomenon in Python. When I wrap my code in a function, it speeds up by more than 30%! So far I have failed to find any reasonable explanation by searching.
For example:
import sys
sys.stdin = open("dd.txt")
import cProfile
import time

t0 = time.time()

def main():
    from collections import defaultdict
    n, T = map(int, raw_input().split())
    tree = defaultdict(lambda: set())
    root = None
    for _ in xrange(n - 1):
        a, b = map(int, raw_input().split())
        tree[a].add(b)
        tree[b].add(a)
        if not root:
            root = a
    count = 0
    stack = [root]
    links = dict()
    links[root] = 0
    mem = dict()
    while stack:
        node = stack.pop()
        path = list()
        path.append(node)
        lnk = links[node]
        while lnk:
            if lnk not in mem:
                if abs(lnk - node) <= T:
                    count += 1
                path.append(lnk)
                lnk = links[lnk]
            else:
                path.extend(mem[lnk])
                for el in mem[lnk]:
                    if abs(el - node) <= T:
                        count += 1
                break
        #print node, path
        plen = len(path)
        mem[node] = path
        for next_node in tree[node]:
            if plen <= 1 or next_node != path[1]:
                links[next_node] = node
                stack.append(next_node)
    print count

main()
print time.time() - t0
This prints the running time as 2.5 seconds, but this:
import sys
sys.stdin = open("dd.txt")
import cProfile
import time

t0 = time.time()

#def main():
from collections import defaultdict
n, T = map(int, raw_input().split())
tree = defaultdict(lambda: set())
root = None
for _ in xrange(n - 1):
    a, b = map(int, raw_input().split())
    tree[a].add(b)
    tree[b].add(a)
    if not root:
        root = a
count = 0
stack = [root]
links = dict()
links[root] = 0
mem = dict()
while stack:
    node = stack.pop()
    path = list()
    path.append(node)
    lnk = links[node]
    while lnk:
        if lnk not in mem:
            if abs(lnk - node) <= T:
                count += 1
            path.append(lnk)
            lnk = links[lnk]
        else:
            path.extend(mem[lnk])
            for el in mem[lnk]:
                if abs(el - node) <= T:
                    count += 1
            break
    #print node, path
    plen = len(path)
    mem[node] = path
    for next_node in tree[node]:
        if plen <= 1 or next_node != path[1]:
            links[next_node] = node
            stack.append(next_node)
print count

#main()
print time.time() - t0
Simply moving the code out of the main() function makes it run in 3.5 seconds instead of 2.5.
What can be the reason for this?
The difference is that Python uses different bytecode operations for accessing the local variables of a function and the global variables of a module. The LOAD_FAST opcode used for accessing local variables takes a numeric index and performs a quick array lookup to retrieve the value. The LOAD_NAME and LOAD_GLOBAL opcodes used for accessing global variables take a name and perform a hash table lookup (possibly in multiple hash tables) to retrieve the value.
By wrapping your code in a function, you're effectively converting all of your variables from globals into locals, which enables much faster access to them.
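A quick way to see this for yourself (a minimal sketch; the exact opcode names can differ between CPython versions) is to disassemble the same statement written against a global and against a local with the dis module:

import dis

x = 0

def increment_global():
    global x
    x = x + 1          # x is a module global here

def increment_local():
    x = 0
    x = x + 1          # x is a function local here

dis.dis(increment_global)   # uses LOAD_GLOBAL / STORE_GLOBAL for x
dis.dis(increment_local)    # uses LOAD_FAST / STORE_FAST for x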
