How to use parallel processing to call the same function multiple times? - python

How can I make this task finish much faster? Can the three calls to generate_ngrams_from_file() run in parallel? I'm new to Python and don't know how to speed it up. I think multiprocessing or threading should do the job, but I have no idea how. This looks like a typical task that could run concurrently and use the multiple cores on my Mac machine.
def tokenize(text):
    return [token for token in text.split(' ')]

def generate_ngrams(text, n):
    tokens = tokenize(text)
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

def generate_ngrams_from_file(input, out, n):
    count = 0
    with open(input, 'r') as f:
        for line in f:
            count += 1
            if line:
                ngrams = generate_ngrams(line, n)
                if n == 2:
                    bigrams.update(ngrams)
                elif n == 3:
                    trigrams.update(ngrams)
                elif n == 4:
                    fourgrams.update(ngrams)
                elif n == 5:
                    fivegrams.update(ngrams)  # original had fourgrams here, presumably a typo
    print("Ngram done!")
if __name__ == "__main__":
    start = time.time()
    input_file = 'bigfile.txt'
    output_3_tram = '3gram.txt'
    output_4_tram = '4ngram.txt'
    output_5_tram = '5ngram.txt'

    print('Generate trigram: ')
    generate_ngrams_from_file(input_file, output_3_tram, 3)

    print("Generate fourgrams: ")
    generate_ngrams_from_file(input_file, output_4_tram, 4)

    print("Generate fivegrams: ")
    generate_ngrams_from_file(input_file, output_5_tram, 5)

    end = time.time()
    mytime(start, end)

Multithreading in Python is not a very good idea here because of Python's Global Interpreter Lock; you can read about it at https://www.geeksforgeeks.org/what-is-the-python-global-interpreter-lock-gil/. Multiprocessing is a better option for making your program faster. You can run generate_ngrams_from_file() inside the Process class of the multiprocessing module; read about the Process class at https://docs.python.org/2/library/multiprocessing.html. The Process class is recommended here because it is faster than both pool.apply() and pool.apply_async().
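As a rough illustration (a sketch, not the answerer's exact code), the three calls from the question could each run in their own Process. This assumes generate_ngrams_from_file() is changed to write its n-grams to the out file it receives, because child processes cannot update the parent's in-memory bigrams/trigrams/fourgrams sets:

import multiprocessing

def run_all():
    jobs = []
    for out_path, n in [('3gram.txt', 3), ('4ngram.txt', 4), ('5ngram.txt', 5)]:
        # one process per n-gram size, each writing to its own output file
        p = multiprocessing.Process(target=generate_ngrams_from_file,
                                    args=('bigfile.txt', out_path, n))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()  # wait for all three to finish

if __name__ == '__main__':
    run_all()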

Related

One of my methods doesn't work correctly when I call it

I need to perform two checks on log files and display the results. Run separately, the methods work correctly, but when I run the whole program the method hit_unique_check always returns "PASS: All hits are unique.". For two of the three .log files this result is incorrect.
import os

class ReadFiles:
    def __init__(self):
        self.current_file = ""
        self.shoot_from = "Shoot from"
        self.hit_player = "Hit player"

    def equally_check(self):
        shoot_from_list = []
        hit_player_list = []
        for line in self.current_file:
            if self.shoot_from in line:
                shoot_from_list.append(line)
            elif self.hit_player in line:
                hit_player_list.append(line)
        if len(shoot_from_list) == len(hit_player_list):
            print(" PASS: Shoots and hits are equal.\n")
        else:
            print(" FAIL: Shoots and hits are NOT equal.\n")

    def hit_unique_check(self):
        unique_hit_list = []
        duplicates = []
        for line in self.current_file:
            if self.hit_player in line:
                unique_hit_list.append(line)
            else:
                continue
        for i in unique_hit_list:
            if unique_hit_list.count(i) > 1:
                duplicates.append(i)
                print(i)
            else:
                continue
        if len(duplicates) < 1:
            print(" PASS: All hits are unique.\n")
        else:
            print(" FAIL: This hits are duplicated.\n")

    def run(self):
        for file in os.listdir():
            if file.endswith(".log"):
                print(f"Log file - {file}")
                self.current_file = open(f"{file}", 'rt')
                print(self.current_file.readlines, f"")
                self.equally_check()
                self.hit_unique_check()
                self.current_file.close()

if __name__ == "__main__":
    run = ReadFiles()
    run.run()
When I run my Python code the result is always the same: "PASS: All hits are unique.". For some files it should be "FAIL: This hits are duplicated.". I'm not sure the problem is in the method hit_unique_check, and I have no idea what to do.
Can you explain how I can make this method work correctly, and not only when it is run separately?
Consider this organization. Each function has one task: to evaluate and return its result. It's up to the caller to decide what to do with that result. Also note that I'm using counters instead of lists, since you don't really care what the lists contain, and the use of defaultdict, which avoids repeated searches of your hit list.
import os
from collections import defaultdict

class ReadFiles:
    def __init__(self):
        self.shoot_from = "Shoot from"
        self.hit_player = "Hit player"

    def equally_check(self, lines):
        shoot_from = 0
        hit_player = 0
        for line in lines:
            if self.shoot_from in line:
                shoot_from += 1
            elif self.hit_player in line:
                hit_player += 1
        return shoot_from == hit_player

    def hit_unique_check(self, lines):
        unique_hit_list = defaultdict(int)
        for line in lines:
            if self.hit_player in line:
                unique_hit_list[line] += 1
        duplicates = 0
        for k, v in unique_hit_list.items():
            if v > 1:
                duplicates += 1
                print(k)
        return not duplicates

    def run(self):
        for filename in os.listdir():
            if filename.endswith(".log"):
                print(f"Log file - {filename}")
                lines = open(filename, 'rt').readlines()
                print(lines)
                if self.equally_check(lines):
                    print(" PASS: Shoots and hits are equal.\n")
                else:
                    print(" FAIL: Shoots and hits are NOT equal.\n")
                if self.hit_unique_check(lines):
                    print(" PASS: All hits are unique.\n")
                else:
                    print(" FAIL: This hits are duplicated.\n")

if __name__ == "__main__":
    run = ReadFiles()
    run.run()
You could even replace the loop in hit_unique_check with a Counter:
from collections import Counter
...
    def hit_unique_check(self, lines):
        unique_hit_list = Counter(lines)
        for k, v in unique_hit_list.items():
            ...
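For completeness, here is a hedged sketch of how the full method might look. Note that the abbreviated snippet above counts every line, so this version filters to the hit lines first to match the behavior of the earlier method:

from collections import Counter

    def hit_unique_check(self, lines):
        # count only the "Hit player" lines; other lines are irrelevant for this check
        hit_counts = Counter(line for line in lines if self.hit_player in line)
        duplicates = 0
        for line, count in hit_counts.items():
            if count > 1:
                duplicates += 1
                print(line)
        return not duplicates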

Python Monty Hall: Multiprocessing slower than direct processing

I am trying out multiprocessing for my Monty Hall game simulation to improve performance. The game is played 10 million times and takes ~17 seconds when run directly; however, my multiprocessing implementation is taking significantly longer. I am clearly doing something wrong but I can't figure out what.
import multiprocessing
from MontyHall.game import Game
from MontyHall.player import Player
from Timer.timer import Timer

def doWork(input, output):
    while True:
        try:
            f = input.get(timeout=1)
            res = f()
            output.put(res)
        except:
            break

def main():
    # game setup
    player_1 = Player(True)  # always switch strategy
    game_1 = Game(player_1)
    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()

    # total simulations
    for i in range(10000000):
        input_queue.put(game_1.play_game)

    with Timer('timer') as t:
        # initialize 5 child processes
        processes = []
        for i in range(5):
            p = multiprocessing.Process(target=doWork, args=(input_queue, output_queue))
            processes.append(p)
            p.start()

        # terminate the processes
        for p in processes:
            p.join()

        results = []
        while len(results) != 10000000:
            r = output_queue.get()
            results.append(r)

        win = results.count(True) / len(results)
        loss = results.count(False) / len(results)
        print(len(results))
        print(win)
        print(loss)

if __name__ == '__main__':
    main()
This is my first post. Advice on posting etiquette is also appreciated. Thank you.
Code for the Classes:
import random

class Player(object):
    def __init__(self, switch_door=False):
        self._switch_door = switch_door

    @property
    def switch_door(self):
        return self._switch_door

    @switch_door.setter
    def switch_door(self, iswitch):
        self._switch_door = iswitch

    def choose_door(self):
        return random.randint(0, 2)

class Game(object):
    def __init__(self, player):
        self.player = player

    def non_prize_door(self, door_with_prize, player_choice):
        """Returns a door that doesn't contain the prize and that isn't the player's original choice"""
        x = 1
        while x == door_with_prize or x == player_choice:
            x = (x + 1) % 3  # assuming there are only 3 doors; can be modified for more doors
        return x

    def switch_function(self, open_door, player_choice):
        """Returns the door that isn't the original player choice and isn't the opened door"""
        x = 1
        while x == open_door or x == player_choice:
            x = (x + 1) % 3  # assuming there are only 3 doors; can be modified for more doors
        return x

    def play_game(self):
        """Game logic"""
        # randomly places the prize behind one of the three doors
        door_with_prize = random.randint(0, 2)
        # player chooses a door
        player_choice = self.player.choose_door()
        # host opens a door that doesn't contain the prize
        open_door = self.non_prize_door(door_with_prize, player_choice)
        # final player choice
        if self.player.switch_door:
            player_choice = self.switch_function(open_door, player_choice)
        # result
        return player_choice == door_with_prize
Code for running it without multiprocessing:
from MontyHall.game import Game
from MontyHall.player import Player
from Timer.timer import Timer

def main():
    # Setting up the game
    player_2 = Player(True)  # always switch
    game_1 = Game(player_2)

    # Testing out the hypothesis
    with Timer('timer_1') as t:
        results = []
        for i in range(10000000):
            results.append(game_1.play_game())
        win = results.count(True) / len(results)
        loss = results.count(False) / len(results)
        print(
            f'When switch strategy is {player_2.switch_door}, the win rate is {win:.2%} and the loss rate is {loss:.2%}')

if __name__ == '__main__':
    main()
As you did not originally give the full code that we can run locally, I can only speculate. My guess is that you are passing an object (a method from your game) to the other processes, so pickling and unpickling takes too much time. Unlike multithreading, where you can "share" data, in multiprocessing you need to pack the data and send it to the other process.
However, there's a rule I always follow when I try to optimize my code: profile before optimizing! It is much better to KNOW what's slow than to GUESS.
It's a multiprocessing program, so there are not a lot of profiling options on the market. You could try viztracer, which supports multiprocessing.
pip install viztracer
viztracer --log_multiprocess your_program.py
It will generate a result.html that you can open with Chrome. Or you can just do
vizviewer result.html
I would suggest reducing the iteration count so you can see the whole picture (viztracer uses a circular buffer, and 10 million iterations will definitely overflow it). Even if you don't, you can still get the last piece of your code's execution, which should be enough for you to figure out what's going on.
I used viztracer once you gave the whole code.
This is one of the iterations in your worker process (viztracer screenshot not reproduced here). As you can tell, the actual working part is very small (the yellowish slice in the middle, p...). Most of the time is spent receiving and putting data, which eliminates the advantage of parallelization.
The correct way to do this is to work in batches. Also, as this game does not actually require any input data, you should just send "do this 1000 times" to the process and let it run, instead of sending the method one call at a time.
There's another interesting problem that you can easily spot with viztracer:
This is the big picture of your worker process (screenshot not reproduced here). Notice the large stretch of "nothing" at the end? That's because your workers need a timeout to finish, and that's when they are just waiting. You should come up with a better way to finish your worker processes elegantly.
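One common pattern for that shutdown problem (a sketch, not taken from the answer; names are illustrative) is to push one sentinel value per worker instead of relying on a timeout:

import multiprocessing

SENTINEL = None  # workers treat this as "no more work"

def worker(input_queue, output_queue):
    while True:
        item = input_queue.get()   # blocks; no timeout needed
        if item is SENTINEL:
            break                  # clean shutdown
        func, args = item
        output_queue.put(func(*args))

def square(x):
    return x * x

if __name__ == '__main__':
    in_q, out_q = multiprocessing.Queue(), multiprocessing.Queue()
    for i in range(10):
        in_q.put((square, (i,)))
    workers = [multiprocessing.Process(target=worker, args=(in_q, out_q)) for _ in range(2)]
    for _ in workers:
        in_q.put(SENTINEL)         # one sentinel per worker, after all real work
    for p in workers:
        p.start()
    results = [out_q.get() for _ in range(10)]
    for p in workers:
        p.join()
    print(sorted(results))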
Updated my code. I fundamentally misunderstood the multiprocessing method.
def do_work(input, output):
    """Generic function that takes an input function and argument and runs it"""
    while True:
        try:
            f, args = input.get(timeout=1)
            results = f(*args)
            output.put(results)
        except:
            output.put('Done')
            break

def run_sim(game, num_sim):
    """Runs the game the given number of times"""
    res = []
    for i in range(num_sim):
        res.append(game.play_game())
    return res

def main():
    input_queue = multiprocessing.Queue()
    output_queue = multiprocessing.Queue()
    g = Game(Player(False))  # set up game and player
    num_sim = 2000000

    for i in range(5):
        input_queue.put((run_sim, (g, num_sim)))  # run sim with game object and number of simulations
                                                  # passed into the queue

    with Timer('Monty Hall Timer: ') as t:
        processes = []  # list to save processes
        for i in range(5):
            p = multiprocessing.Process(target=do_work, args=(input_queue, output_queue))
            processes.append(p)
            p.start()

        results = []
        while True:
            r = output_queue.get()
            if r != 'Done':
                results.append(r)
            else:
                break

        # terminate processes
        for p in processes:
            p.terminate()

        # combining the five returned lists
        flat_list = [item for sublist in results for item in sublist]
        print(len(flat_list))
        print(len(results))

Extracting two or three smaller functions from main() to find errors more easily

I would like to extract shorter functions from the larger main function to make it more readable and easier to debug without removing functionality.
I was thinking of splitting it down the middle, as shown with def calc_mean(). However, an issue is that data, among other things, is not defined in that function. How should I change these pieces so the original program still works despite being divided in two?
It is a never-ending loop: user_input calls main, then main calls user_input, then user_input calls main, and so on.
FIX 1
Remove filename = input(user_input()) from the main function and pass filename as an argument from user_input. In this case the whole script should first call the user_input function.
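A minimal sketch of this first fix (hypothetical function bodies; it assumes the read_data() and calc_mean() helpers shown in the Update further down):

def user_input():
    filename = input("Enter a file name: ")
    main(filename)                 # user_input is the entry point and hands the name to main

def main(filename):
    data = read_data(filename)     # no call back to user_input(), so no loop
    for batch, sample in data.items():
        print(batch, calc_mean(sample))

if __name__ == '__main__':
    user_input()                   # with FIX 1 the script starts from user_input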
FIX 2
Replace filename = input(user_input()) with filename = user_input() and adjust the user_input function so it only asks for user input and then returns it. In this case the script should first call the main function.
Also, at the bottom of the script it should be
if __name__ == '__main__': # not `main`
call_some_function()
Update
def user_input():  # adjust this if you need
    filename = input()
    return filename

def read_data(filename):
    data = dict()
    with open(filename, 'r') as h:
        for line in h:
            four_vals = line.split(',')
            batch = four_vals[0]
            if not batch in data:
                data[batch] = []
            data[batch] += [(float(four_vals[1]), float(four_vals[2]), float(four_vals[3]))]
    return data

def calc_mean(sample):
    if len(sample) == 0:
        return
    n = 0
    x_sum = 0
    for (x, y, val) in sample:
        if x**2 + y**2 <= 1:
            x_sum += val
            n += 1
    average = x_sum / n
    return average

def main():
    '''
    This is the main body of the program.
    '''
    filename = user_input()
    data = read_data(filename)
    for batch, sample in data.items():
        average = calc_mean(sample)
        if average is not None:
            print(f"{batch}\t{average}")
        else:
            print(f"{batch}\tNo data")
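Since the Update follows FIX 2 (user_input() just returns the name and main() drives everything), the entry point suggested above would simply call main():

if __name__ == '__main__':
    main()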

Python multi-threading two parallel loops

Let's say I have two loops that I want to run in parallel. What is the best way to run them in parallel using Python? Currently I am experimenting with multi-threading using the following program:
#!/usr/bin/env python
import time
import serial
import os
from threading import Thread

ser = serial.Serial(port='/dev/ttyUSB0', baudrate=38400, timeout=None)
ser.flushInput()
ser.flushOutput()

def getstrings(port):
    buf = bytearray()
    while True:
        b = port.read(1)
        if b == b'\x02':
            del buf[:]
        elif b == b'\x03':
            yield buf.decode('ascii')
        else:
            buf.append(b)

def tester():
    while 1:
        print('testing')

def values():
    count = ""
    tem = ""
    hv = ""
    counti = 0
    temi = 0
    hvi = 0
    while 1:
        for item in getstrings(ser):
            #if len(item) >= 10:
            #    continue
            if item[1] == "C":
                count = item.split('C')[1]
                counti = int(count[0:5])
            if item[1] == "T":
                tem = item.split('T')[1]
                temi = int(tem[0:5])
            if item[1] == "H":
                hv = item.split('H')[1]
                hvi = int(hv[0:5]) / 10
            print("HV=" + str(hvi) + " " + "Count=" + str(counti) + " " + "Temp=" + str(temi))

t1 = Thread(target=values)
t2 = Thread(target=tester)
t1.start()
t2.start()
Only the second thread works; it doesn't print the values from the first one. This is the first time I am experimenting with multi-threading. Once I understand how this works, I intend to use it to design a GUI with the Tkinter libraries: I want to run my program's loop alongside the Tkinter main loop. Any suggestions on where I might be making a mistake?
Update:
Yes, it's thread 2 that works, not thread 1. My mistake, sorry about that. Individually, both threads work if I comment out t1.start() or t2.start(); however, together only thread 2 prints its output.
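For reference, a stripped-down, self-contained sketch (independent of the serial-port code above, with made-up loop bodies) showing two loops genuinely running in parallel threads; each print flushes so output from both threads appears promptly:

import time
from threading import Thread

def loop_a():
    while True:
        print('loop A tick', flush=True)
        time.sleep(1)

def loop_b():
    while True:
        print('loop B tick', flush=True)
        time.sleep(1)

if __name__ == '__main__':
    t1 = Thread(target=loop_a, daemon=True)
    t2 = Thread(target=loop_b, daemon=True)
    t1.start()
    t2.start()
    time.sleep(5)  # let both loops run for a few seconds before the main thread exits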

Performing an action as python script closes

I was wondering if it is possible to perform an action at any given point in a basic Python script, say, when it is closed. I have the following code to find prime numbers (just for fun):
number = 1
primelist = []
nonprime = []

while number < 1000:
    number += 1
    for i in range(number):
        if i != 1 and i != number and i != 0:
            if number % i == 0:
                nonprime.append(number)
            else:
                primelist.append(number)

nonprimes = open("nonprimes.txt", "w")
for nonprime in set(primelist) & set(nonprime):
    nonprimes.write(str(nonprime) + ", ")
nonprimes.close()
So basically I want to run the last part as the script is stopped. If this isn't possible, is there a way where, say, I press "space" while the program is running and it then saves the list?
Cheers in advance :)
EDIT:
I've modified the code to include the atexit module as suggested, but it doesn't appear to be working. Here it is:
import time, atexit

class primes():
    def __init__(self):
        self.work(1)

    def work(self, number):
        number = 1
        self.primelist = []
        self.nonprime = []
        while number < 20:
            time.sleep(0.1)
            print "Done"
            number += 1
            for i in range(number):
                if i != 1 and i != number and i != 0:
                    if number % i == 0:
                        self.nonprime.append(number)
                    else:
                        self.primelist.append(number)
        nonprimes = open("nonprimes.txt", "w")
        for nonprime in set(self.primelist) & set(self.nonprime):
            nonprimes.write(str(nonprime) + ", ")
        nonprimes.close()

    def exiting(self, primelist, nonprimelist):
        primelist = self.primelist
        nonprimelist = self.nonprime
        nonprimes = open("nonprimes.txt", "w")
        for nonprime in set(self.primelist) & set(self.nonprime):
            nonprimes.write(str(nonprime) + ", ")
        nonprimes.close()

    atexit.register(exiting)

if __name__ == "__main__":
    primes()
While I'm pretty certain the file object does clean up and flush its contents to the file when it is reclaimed, the best way to go about this is to use a with statement:
with open("nonprimes.txt", "w") as nonprimes:
    for nonprime in set(primelist) & set(nonprime):
        nonprimes.write(str(nonprime) + ", ")
The boilerplate code for closing the file and so on is performed automatically when the statement ends.
Python has an atexit module that allows you to register code you want executed when a script exits:
import atexit, sys

def doSomethingAtExit():
    print "Doing something on exit"

atexit.register(doSomethingAtExit)

if __name__ == "__main__":
    sys.exit(1)
    print "This won't get called"
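Applied to the class from the EDIT above, one option (a sketch, not from the answer; the class and method names here are illustrative) is to register a bound method on an instance, so atexit can call it without extra arguments:

import atexit

class Primes(object):
    def __init__(self):
        self.primelist = []
        self.nonprime = []
        atexit.register(self.save)  # bound method: no arguments needed at exit

    def save(self):
        with open("nonprimes.txt", "w") as nonprimes:
            for nonprime in set(self.primelist) & set(self.nonprime):
                nonprimes.write(str(nonprime) + ", ")

if __name__ == "__main__":
    p = Primes()
    # ... fill p.primelist / p.nonprime as in the EDIT; save() runs automatically at exit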
