Python + SimPy: NameError: name 'move' is not defined

I am building a simulation of a subway system using Python 2.6 and SimPy. Here is my code:
import sys
import random
from math import*
from math import ceil, log
from random import*
from random import random, uniform, seed, expovariate
from SimPy import*
from SimPy.Simulation import*
from math import ceil, log
totalusuarios = 0
cantgrupos=0
def triangulo(inf, sup, moda):
    return random.triangular((inf), (sup), (moda))

def geometric(q):
    if q == 1.0:
        return 1
    U = 1.0 - random.random()
    G = int(ceil(log(U) / log(1.0 - q)))
    return G
# A class that represents the process of generation of Groups (arrivals)
class Generador(Process):
    def generar(self, prob, interarribo, porc_torniq, porc_taq, porc_maq, min, max, moda, tsertaq, tsertor, tsermaq, loncal):
        global totalusuarios
        global cantgrupos
        totalusuarios = 0
        cantgrupos = 0
        while True:
            size_g = geometric(prob)
            if (now() > loncal):
                cantgrupos += 1
                totalusuarios = totalusuarios + size_g
            for j in range(size_g):
                c = Customer(name="Usuario%02d" % (j,))
                q = uniform(0, 1)
                ##****************** the user goes to the turnstile -------------------------
                if (q <= porc_torniq):  # the user goes to the turnstile
                    activate(c, c.go_torn(min=min, max=max, moda=moda, tsertor=tsertor))  # the customer walks there
                ##****************** the user walks to the ticket office -------------------------
                if (q > porc_torniq and q <= porc_torniq + porc_taq):  # the user goes to the ticket office to buy
                    activate(c, c.go_tickets(min, max, moda, tsertaq=tsertaq, tsertor=tsertor))
                ##****************** the user walks to the ticket machines -------------------------
                if (q > porc_torniq + porc_taq):  # the user goes to the machines
                    activate(c, c.go_machines(min=min, max=max, moda=moda, tsermaq=tsermaq, tsertor=tsertor))
            t = expovariate(interarribo)  # time between groups of users
            yield hold, self, t
class Customer(Process):
    def move(self, min, max, moda):
        t1 = triangulo(min_, max_, moda_)
        yield hold, self, t1
    def go_torn(self, min, max, moda, tsertor):
        move(min, max, moda)
        yield request, self, torniquete
        t2 = expovariate(tsertor)
        yield hold, self, t2
        yield release, self, torniquete
    def go_tickets(self, min, max, moda, tsertaq, tsertor):
        move(min, max, moda)
        yield request, self, taquilla
        t3 = expovariate(tsertaq)
        yield hold, self, t3
        yield release, self, taquilla
        go_torn(self, min, max, moda, tsertor)
    def go_machines(self, min, max, moda, tsermaq, tsertor):
        move(min, max, moda)
        yield request, self, taquilla
        t4 = expovariate(tsermaq)
        yield hold, self, t4
        yield release, self, taquilla
        go_torn(self, min, max, moda, tsertor)
## Experiment data ------------------------------
MedGru= 2.0
p= 1/MedGru
TasGru= 5.0
LonCor = 24.0
CanCor= 30
CanTor = 2
CanTaq=2
CanMaq=2
PorTor= 60.0/100.0
PorTaq= 20.0/100.0
PorMaq=20.0/100.0
MinDes= 0.1
MaxDes= 0.2
LonCal= 2.0*60
ModaDes= 0.15
TSerTaq= 1/0.35
TSerTor=1/0.1
TSerMaq= 1/0.5
## Model/Experiment ------------------------------
torniquete = Resource(capacity=CanTor, monitored=True, monitorType= Monitor)
maquina = Resource(capacity=CanMaq, monitored=False)
taquilla = Resource(capacity=CanTaq, monitored=False)
def simulate_():
    generador = Generador(name="Grupo")
    initialize()  # initialize the simulation clock
    activate(generador, generador.generar(p, TasGru, PorTor, PorTaq, PorMaq,
                                          MinDes, MaxDes, ModaDes, TSerTaq, TSerTor, TSerMaq, LonCal))
    simulate(until=60*LonCor)

for i in range(CanCor):
    simulate_()
    print "Groups:", cantgrupos, "Users:", totalusuarios
The Customer class has four methods. move uses a triangular distribution to simulate the displacement of a passenger inside the station, from the entrance to any area (ticket office, machines or turnstiles) and from one area to another; this walking time is triangularly distributed with parameters min, mode and max minutes. The time it takes to serve each passenger at a ticket office is exponentially distributed with mean TSerTaq minutes. Each passenger using a ticket vending machine occupies it for a random time exponentially distributed with mean TSerMaq minutes. Passing through a turnstile takes each passenger a random time exponentially distributed with mean TSerTor minutes.
When I try to run my code, I get the following error:
C:\Documents and Settings>python llegada.py
Traceback (most recent call last):
File "llegada.py", line 111, in <module>
simulate_()
File "llegada.py", line 109, in simulate_
simulate(until=60*LonCor)
File "C:\Python26\SimPy\Globals.py", line 39, in simulate
return sim.simulate(until = until)
File "C:\Python26\SimPy\Simulation.py", line 689, in simulate
a = nextev()
File "C:\Python26\SimPy\Simulation.py", line 408, in _nextev
resultTuple = nextEvent._nextpoint.next()
File "llegada.py", line 65, in go_tickets
move(min, max, moda)
NameError: global name 'move' is not defined
I do not understand what I am doing wrong, or why Python says that move is not defined. Any help would be appreciated.

You want self.move(), not move(). A bare move() refers to a top-level function in the module, hence Python's complaint about not finding it as a global name; self.move() is a method on your class instance, which is what you actually have.
All your other method calls (go_torn and friends) need the self. prefix too.
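For illustration, here is a minimal sketch of what the corrected calls could look like under the classic SimPy 2 API (assuming triangulo, torniquete, expovariate, hold, request and release are available as in the question). Note that move is itself a generator method, so simply calling self.move(...) only creates the generator without running it; one way to actually run it is to re-yield its scheduling commands:

class Customer(Process):
    def move(self, min, max, moda):
        # walking time inside the station, triangularly distributed
        yield hold, self, triangulo(min, max, moda)

    def go_torn(self, min, max, moda, tsertor):
        # run the move() sub-generator by re-yielding its commands
        for command in self.move(min, max, moda):
            yield command
        yield request, self, torniquete
        yield hold, self, expovariate(tsertor)
        yield release, self, torniquete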

Related

How do you model multiple arrival distributions?

I am simulating a call centre with two types of incoming calls: sales calls and service calls.
These calls have different, independent distributions, and they enter the same system.
I have a function, arrivals, which contains:
iat_sales = random.expovariate(1/3)
yield env.timeout(iat_sales)
I want to incorporate:
iat_service = random.triangular(0,0,6)
yield env.timeout(iat_service)
How can I yield both kinds of arrival events at the same time?
This is the solution I have come up with:
def arrival_list():
    sales_time = 0    # sim time of sales arrival
    service_time = 0  # sim time of service arrival
    sales_list = []   # list of sequential sales arrivals [arrival time, 'sales']
    service_list = [] # list of sequential service arrivals [arrival time, 'service']
    arrivals = []     # ordered list of arrivals (sales and service merged) [arrival time, arrival type, iat]
    while sales_time < sim_end:
        iat_sales = random.expovariate(sales_rate)
        sales_time += iat_sales
        sales = [sales_time, 'sales']
        sales_list.append(sales)
    while service_time < sim_end:
        iat_service = random.triangular(0, 6, 0)
        service_time += iat_service
        service = [service_time, 'service']
        service_list.append(service)
    arrivals = sales_list + service_list
    arrivals.sort()
    arrivals[0].append(arrivals[0][0])
    for i in range(len(arrivals) - 1):
        arrivals[i + 1].append(arrivals[i + 1][0] - arrivals[i][0])
    return arrivals
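If you then want to drive a SimPy model from that pre-computed list, one option (a minimal sketch, assuming arrival_list() and its sim_end / sales_rate globals are defined as above) is to replay the stored inter-arrival times with env.timeout():

import simpy

def replay_arrivals(env, arrivals):
    # each entry is [arrival time, arrival type, inter-arrival time]
    for arrival_time, call_type, iat in arrivals:
        yield env.timeout(iat)  # wait out the pre-computed gap
        print(env.now, call_type, 'call arrives')

env = simpy.Environment()
env.process(replay_arrivals(env, arrival_list()))
env.run()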
As a reference, a simple implementation can be done like this, where a simulation is run indefinitely with 1 second intervals and calls are considered to arrive if their random values exceed some thresholds:
import random
import time

def generate_calls():
    return random.expovariate(1/3), random.triangular(10, 20, 5)

def simulation(count, sales_acceptance, services_acceptance):
    # run the simulation indefinitely
    while True:
        print('Time: {}'.format(count))
        sales_call, services_call = generate_calls()
        # calls arrive if the values exceed some thresholds
        if sales_call > sales_acceptance:
            print('Sales call arrived!')
        if services_call > services_acceptance:
            print('Services call arrived!')
        time.sleep(1)
        count += 1

simulation(1, 2, 13)
You can have three separate parallel processes:
1. One process for making sales calls.
2. One process for making service calls.
3. One process for handling calls.
import simpy
import random

sim_end = 1000

def generateSalesCall(env, call_pipe):
    while env.now < sim_end:
        # put call in the pipe
        yield call_pipe.put("sales")
        interval = random.expovariate(1/3)
        yield env.timeout(interval)

def generateServiceCall(env, call_pipe):
    while env.now < sim_end:
        # put call in the pipe
        yield call_pipe.put("service")
        interval = random.triangular(0, 6, 0)
        yield env.timeout(interval)

def handleCalls(env, call_pipe):
    while True:
        call = yield call_pipe.get()
        if call == "sales":
            print(env.now, "sales call")
        elif call == "service":
            print(env.now, "service call")

env = simpy.Environment()
call_pipe = simpy.Store(env)
env.process(generateSalesCall(env, call_pipe))
env.process(generateServiceCall(env, call_pipe))
env.process(handleCalls(env, call_pipe))
env.run()

Python script error sqlite3.OperationalError: no such column:

I get this error when I run the script and I cannot see the solution. The program is supposed to draw a giveaway winner from an sqlite3 file that holds the number of raffle tickets for each user. The program that creates the sqlite3 file recently updated some things (the draw script itself is made by me), and I cannot figure out the solution.
Traceback (most recent call last):
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 244, in <module>
dd = DaveDraw()
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 64, in __init__
self.get_viewers()
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 215, in
get_viewers
''').fetchall()
sqlite3.OperationalError: no such column: viewer_id
there's the code
#!/usr/bin/env python3
import pdb
import random
import sqlite3
class Viewer(object):
def __init__(self,
viewer_id,
twitch_name,
beam_name,
beam_id,
viewer_type,
rank,
points,
points2,
hours,
raids,
gains_currency,
gains_hours,
in_giveaways,
last_seen,
sub,
entrance_message,
entrance_message_type,
entrance_sfx
):
self.viewer_id = viewer_id
self.twitch_name = twitch_name
self.beam_name = beam_name
self.beam_id = beam_id
self.viewer_type = viewer_type
self.rank = rank
self.points = points
self.points2 = points2
self.hours = hours
self.raids = raids
self.gains_currency = gains_currency
self.gains_hours = gains_hours
self.in_giveaways = in_giveaways
self.last_seen = last_seen
self.sub = sub
self.entrance_message = entrance_message
self.entrance_message_type = entrance_message_type
self.entrance_sfx = entrance_sfx
def win_chance(self, total_tickets):
"""
Takes the total tickets (points) as a paramter and works
out the percentage chance that the viewer has of winning.
Returns the viewers win chance in percent.
"""
percent = total_tickets / 100.00
return self.points2 / percent
class DaveDraw(object):
def __init__(self):
self.debug = False
self.database_path = 'Viewers3DB.sqlite'
self.db_conn = sqlite3.connect(self.database_path)
self.get_viewers()
self.calculate_total_points()
self.assign_tickets()
def assign_tickets(self):
"""
Assigns each user a number range based on the number of
tickets they have.
e.g.
10 1-10
10 11-20
30 21-50
1 51
"""
self.tickets = {}
latest_ticket = 0
for viewer in self.viewers:
# skip anyone with no points
if viewer.points2 == 0:
continue
ticket_range_beg = latest_ticket + 1
ticket_range_end = latest_ticket + 1 + viewer.points2
latest_ticket = ticket_range_end
viewer.tickets = range(ticket_range_beg, ticket_range_end)
# assign a range of tickets:
if self.debug:
print("Assigning viewer twitch: %s beam: %s tickets %i-%i" % (viewer.twitch_name, viewer.beam_name, viewer.tickets.start, viewer.tickets.stop))
if ticket_range_beg == ticket_range_end:
if self.debug:
print("Assigning ticket {} to {}".format(ticket_range_beg, viewer.twitch_name))
self.tickets[ticket_range_beg] = viewer
next
for ticket in viewer.tickets:
if self.debug:
print("Assigning ticket {} to {}".format(ticket, viewer.twitch_name))
self.tickets[ticket] = viewer
def calculate_total_points(self):
"""
Gets the total amount of points awarded to all
viewers.
"""
self.total_points = 0
for viewer in self.viewers:
self.total_points += viewer.points2
self.total_points_percent = self.total_points / 100
print("Total points awarded (total tickets): %s" % self.total_points)
def draw(self):
"""
Picks a random number between 1 and total tickets, finds
the user that has been assigned tickets within that range and
returns the user.
"""
ticket = random.randint(1, self.total_points)
try:
winner = self.tickets[ticket]
except:
pdb.set_trace()
print("\n===== WINNER Twitch: {} / Beam: {} =====\n".format(winner.twitch_name, winner.beam_id))
print("Picked ticket {}\n".format(ticket))
print("Winner win chance: {:f}".format(winner.win_chance(self.total_points)))
print("Winner's ticket range: {}-{}".format(winner.tickets.start, winner.tickets.stop))
print("Winner's ticket amount: {}\n".format(winner.points2))
self.display_viewer(winner)
def display_random_viewer(self):
"""
Displays random viewer.
"""
self.display_viewer(self.get_random_viewer())
def display_viewer(self, viewer):
"""
Outputs the data on all viewers.
"""
print("""Viewer ID: %s\nTwitch Name: %s\nBeam Name: %s\nBeam ID: %s\nRank: %s\nPoints: %s\nPoints2: %s\nHours: %s\nRaids: %s\nGains Currency: %s\nGains Hours: %s\nInGiveaways: %s\nLastSeen: %s\nEntrance Message: %s\nEntranceMsgType: %s\nEntranceSFX: %s"""
% (
viewer.viewer_id,
viewer.twitch_name,
viewer.beam_name,
viewer.beam_id,
viewer.rank,
viewer.points,
viewer.points2,
viewer.hours,
viewer.raids,
viewer.gains_currency,
viewer.gains_hours,
viewer.in_giveaways,
viewer.last_seen,
viewer.entrance_message,
viewer.entrance_message_type,
viewer.entrance_sfx
)
)
def get_random_viewer(self):
"""
Gets a completely random viewer.
"""
return random.choice(self.viewers)
def get_viewers(self):
"""
Gets data on all the viewers in the database and stores
the data in self.viewers.
"""
c = self.db_conn.cursor()
viewers = c.execute('''
SELECT
viewer_id,
TwitchName,
BeamName,
BeamID,
Type,
Rank,
Points,
Points2,
Hours,
Raids,
GainsCurrency,
GainsHours,
InGiveaways,
LastSeen,
Sub,
EntranceMessage,
EntranceMsgType,
EntranceSFX
FROM Viewer
WHERE Type != 1
AND TwitchName NOT IN (
\'treeboydave\',
\'treebotdave\'
);
''').fetchall()
self.viewers = []
for cur_viewer in viewers:
self.viewers.append(
Viewer(
cur_viewer[0],
cur_viewer[1],
cur_viewer[2],
cur_viewer[3],
cur_viewer[4],
cur_viewer[5],
cur_viewer[6],
cur_viewer[7],
cur_viewer[8],
cur_viewer[9],
cur_viewer[10],
cur_viewer[11],
cur_viewer[12],
cur_viewer[13],
cur_viewer[14],
cur_viewer[15],
cur_viewer[16],
cur_viewer[17]
)
)
if __name__ == '__main__':
dd = DaveDraw()
dd.draw()
All your other SQL columns are capitalised; any chance that's why it's not finding the viewer_id column? Maybe it's Viewer_Id or similar.
If you execute PRAGMA table_info(Viewer) and print what it returns, it will give you an outline of all of the columns in that table, so you can check whether you have the capitalisation correct, or whether the column actually isn't there at all.
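For example, a minimal sketch (assuming the same Viewers3DB.sqlite file as in the question) that lists the actual column names:

import sqlite3

conn = sqlite3.connect('Viewers3DB.sqlite')
# PRAGMA table_info returns one row per column: (cid, name, type, notnull, dflt_value, pk)
for cid, name, col_type, notnull, default, pk in conn.execute('PRAGMA table_info(Viewer)'):
    print(name, col_type)
conn.close()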

Not sure why I'm getting this attribute error

The assertEqual tests are from a module that just calls a function, runs some data through it, computes the result of that processed data, and compares it to my predicted answer. For example, my predicted answer for total_test was 6.0.
When I run my code (below being the troubled part), I get this error:
Traceback (most recent call last):
File "C:/Users/anon/Desktop/test.py", line 72, in <module>
TransactionTest().run()
File "C:/Users/anon/Desktop/test.py", line 68, in run
self.total_test()
File "C:/Users/anon/Desktop/test.py", line 60, in total_test
assertEqual(self.__t1.total(), 6.0)
File "C:/Users/anon/Desktop/test.py", line 18, in total
return sum(map(lambda p: p.cost(), self.__purchases))
File "C:/Users/anon/Desktop/test.py", line 18, in <lambda>
return sum(map(lambda p: p.cost(), self.__purchases))
AttributeError: 'float' object has no attribute 'cost'
The line numbers in the traceback are shifted down a few lines relative to the code below, because I modified it very slightly when copying and pasting it here.
Essentially, my total_test function is causing a crash when it's called. Not sure why I'm getting an attribute error.
class Transaction:
    def __init__(self, purchases, tax_rate):
        self.__purchases = purchases
        self.__tax_rate = tax_rate

    def total(self):
        return sum(map(lambda p: p.cost(), self.__purchases))

    def tax_rate(self):
        return self.__tax_rate

    def total_taxable(self):
        taxable_items = filter(lambda p: p.item().taxable(), self.__purchases)
        return sum(map(lambda p: p.cost(), taxable_items))

    def grand_total(self):
        return self.total() + self.__tax_rate * self.total_taxable()

    def __str__(self):
        return "Total: " + self.__total + ";" + "Total_tax: " + self.__total_taxable * self.__tax_rate + ";" + "Grand Total: " + self.__grand_total

    def print_receipt(self):
        f = open("receipt.txt", "w")
        f.write("\n".join(map(lambda p: str(p), self.__purchases)))
        f.write("\n")
        f.write("Total: $%.2f" % self.total())
        f.write("\n")
        f.write("Tax ( $%.2f # %.2f %%): $%.2f" % (self.total_taxable(), self.__tax_rate * 100, self.__tax_rate * self.total_taxable()))
        f.write("\n")
        f.write("Grand Total: $%.2f" % self.grand_total())
        f.close()

#problem 9-----------------------------------------------------------------------------------
class TransactionTest:
    print('----------------------------')

    def __init__(self):
        t_list = [1.0, 2.0, 3.0]
        self.__t1 = Transaction(t_list, 0.05)
        #self.__d2 = Transaction(3.0, 0.06)

    def total_test(self):
        print('total_test-----------------------------------------')
        assertEqual(self.__t1.total(), 6.0)

    def tax_rate_test(self):
        print('tax_rate_test--------------------------------------')
        assertEqual(self.__t1.tax_rate(), 0.05)
        #assertEqual(self.__d2.tax_rate() = Transaction(0.06))

    def run(self):
        self.total_test()
        #self.tax_rate_test()
        #self.str_test()

TransactionTest().run()
Your test code passes a list of three float instances, [1.0, 2.0, 3.0], as the purchases argument to Transaction's initializer. However, the other Transaction methods try to call various methods (e.g. cost() and item()) on the values in that list, and since float instances don't have those methods, an exception is raised.
I suspect your Transaction code is intended to be run on a list of some other kind of object, where the appropriate methods are defined. You need to rewrite your tests to use the right kind of objects.
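For illustration only, the purchases would need to be objects that actually provide cost() and item(); the Purchase and Item classes below are hypothetical stand-ins (the real ones aren't shown in the question), wired up so that Transaction.total() returns 6.0:

class Item:
    def __init__(self, name, is_taxable):
        self._name = name
        self._is_taxable = is_taxable
    def taxable(self):
        return self._is_taxable

class Purchase:
    def __init__(self, item, cost):
        self._item = item
        self._cost = cost
    def item(self):
        return self._item
    def cost(self):
        return self._cost

# three purchases whose costs sum to 6.0
purchases = [Purchase(Item("apple", False), 1.0),
             Purchase(Item("soda", True), 2.0),
             Purchase(Item("candy", True), 3.0)]
t1 = Transaction(purchases, 0.05)
print(t1.total())          # 6.0
print(t1.total_taxable())  # 5.0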

Python script taking too much memory [duplicate]

I've recently become interested in algorithms and have begun exploring them by writing a naive implementation and then optimizing it in various ways.
I'm already familiar with the standard Python module for profiling runtime (for most things I've found the timeit magic function in IPython to be sufficient), but I'm also interested in memory usage so I can explore those tradeoffs as well (e.g. the cost of caching a table of previously computed values versus recomputing them as needed). Is there a module that will profile the memory usage of a given function for me?
Python 3.4 includes a new module: tracemalloc. It provides detailed statistics about which code is allocating the most memory. Here's an example that displays the top three lines allocating memory.
from collections import Counter
import linecache
import os
import tracemalloc
def display_top(snapshot, key_type='lineno', limit=3):
snapshot = snapshot.filter_traces((
tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
tracemalloc.Filter(False, "<unknown>"),
))
top_stats = snapshot.statistics(key_type)
print("Top %s lines" % limit)
for index, stat in enumerate(top_stats[:limit], 1):
frame = stat.traceback[0]
# replace "/path/to/module/file.py" with "module/file.py"
filename = os.sep.join(frame.filename.split(os.sep)[-2:])
print("#%s: %s:%s: %.1f KiB"
% (index, filename, frame.lineno, stat.size / 1024))
line = linecache.getline(frame.filename, frame.lineno).strip()
if line:
print(' %s' % line)
other = top_stats[limit:]
if other:
size = sum(stat.size for stat in other)
print("%s other: %.1f KiB" % (len(other), size / 1024))
total = sum(stat.size for stat in top_stats)
print("Total allocated size: %.1f KiB" % (total / 1024))
tracemalloc.start()
counts = Counter()
fname = '/usr/share/dict/american-english'
with open(fname) as words:
words = list(words)
for word in words:
prefix = word[:3]
counts[prefix] += 1
print('Top prefixes:', counts.most_common(3))
snapshot = tracemalloc.take_snapshot()
display_top(snapshot)
And here are the results:
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)]
Top 3 lines
#1: scratches/memory_test.py:37: 6527.1 KiB
words = list(words)
#2: scratches/memory_test.py:39: 247.7 KiB
prefix = word[:3]
#3: scratches/memory_test.py:40: 193.0 KiB
counts[prefix] += 1
4 other: 4.3 KiB
Total allocated size: 6972.1 KiB
When is a memory leak not a leak?
That example is great when the memory is still being held at the end of the calculation, but sometimes you have code that allocates a lot of memory and then releases it all. It's not technically a memory leak, but it's using more memory than you think it should. How can you track memory usage when it all gets released? If it's your code, you can probably add some debugging code to take snapshots while it's running. If not, you can start a background thread to monitor memory usage while the main thread runs.
Here's the previous example where the code has all been moved into the count_prefixes() function. When that function returns, all the memory is released. I also added some sleep() calls to simulate a long-running calculation.
from collections import Counter
import linecache
import os
import tracemalloc
from time import sleep
def count_prefixes():
sleep(2) # Start up time.
counts = Counter()
fname = '/usr/share/dict/american-english'
with open(fname) as words:
words = list(words)
for word in words:
prefix = word[:3]
counts[prefix] += 1
sleep(0.0001)
most_common = counts.most_common(3)
sleep(3) # Shut down time.
return most_common
def main():
tracemalloc.start()
most_common = count_prefixes()
print('Top prefixes:', most_common)
snapshot = tracemalloc.take_snapshot()
display_top(snapshot)
def display_top(snapshot, key_type='lineno', limit=3):
snapshot = snapshot.filter_traces((
tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
tracemalloc.Filter(False, "<unknown>"),
))
top_stats = snapshot.statistics(key_type)
print("Top %s lines" % limit)
for index, stat in enumerate(top_stats[:limit], 1):
frame = stat.traceback[0]
# replace "/path/to/module/file.py" with "module/file.py"
filename = os.sep.join(frame.filename.split(os.sep)[-2:])
print("#%s: %s:%s: %.1f KiB"
% (index, filename, frame.lineno, stat.size / 1024))
line = linecache.getline(frame.filename, frame.lineno).strip()
if line:
print(' %s' % line)
other = top_stats[limit:]
if other:
size = sum(stat.size for stat in other)
print("%s other: %.1f KiB" % (len(other), size / 1024))
total = sum(stat.size for stat in top_stats)
print("Total allocated size: %.1f KiB" % (total / 1024))
main()
When I run that version, the memory usage has gone from 6MB down to 4KB, because the function released all its memory when it finished.
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)]
Top 3 lines
#1: collections/__init__.py:537: 0.7 KiB
self.update(*args, **kwds)
#2: collections/__init__.py:555: 0.6 KiB
return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
#3: python3.6/heapq.py:569: 0.5 KiB
result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
10 other: 2.2 KiB
Total allocated size: 4.0 KiB
Now here's a version inspired by another answer that starts a second thread to monitor memory usage.
from collections import Counter
import linecache
import os
import tracemalloc
from datetime import datetime
from queue import Queue, Empty
from resource import getrusage, RUSAGE_SELF
from threading import Thread
from time import sleep
def memory_monitor(command_queue: Queue, poll_interval=1):
tracemalloc.start()
old_max = 0
snapshot = None
while True:
try:
command_queue.get(timeout=poll_interval)
if snapshot is not None:
print(datetime.now())
display_top(snapshot)
return
except Empty:
max_rss = getrusage(RUSAGE_SELF).ru_maxrss
if max_rss > old_max:
old_max = max_rss
snapshot = tracemalloc.take_snapshot()
print(datetime.now(), 'max RSS', max_rss)
def count_prefixes():
sleep(2) # Start up time.
counts = Counter()
fname = '/usr/share/dict/american-english'
with open(fname) as words:
words = list(words)
for word in words:
prefix = word[:3]
counts[prefix] += 1
sleep(0.0001)
most_common = counts.most_common(3)
sleep(3) # Shut down time.
return most_common
def main():
queue = Queue()
poll_interval = 0.1
monitor_thread = Thread(target=memory_monitor, args=(queue, poll_interval))
monitor_thread.start()
try:
most_common = count_prefixes()
print('Top prefixes:', most_common)
finally:
queue.put('stop')
monitor_thread.join()
def display_top(snapshot, key_type='lineno', limit=3):
snapshot = snapshot.filter_traces((
tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
tracemalloc.Filter(False, "<unknown>"),
))
top_stats = snapshot.statistics(key_type)
print("Top %s lines" % limit)
for index, stat in enumerate(top_stats[:limit], 1):
frame = stat.traceback[0]
# replace "/path/to/module/file.py" with "module/file.py"
filename = os.sep.join(frame.filename.split(os.sep)[-2:])
print("#%s: %s:%s: %.1f KiB"
% (index, filename, frame.lineno, stat.size / 1024))
line = linecache.getline(frame.filename, frame.lineno).strip()
if line:
print(' %s' % line)
other = top_stats[limit:]
if other:
size = sum(stat.size for stat in other)
print("%s other: %.1f KiB" % (len(other), size / 1024))
total = sum(stat.size for stat in top_stats)
print("Total allocated size: %.1f KiB" % (total / 1024))
main()
The resource module lets you check the current memory usage, and save the snapshot from the peak memory usage. The queue lets the main thread tell the memory monitor thread when to print its report and shut down. When it runs, it shows the memory being used by the list() call:
2018-05-29 10:34:34.441334 max RSS 10188
2018-05-29 10:34:36.475707 max RSS 23588
2018-05-29 10:34:36.616524 max RSS 38104
2018-05-29 10:34:36.772978 max RSS 45924
2018-05-29 10:34:36.929688 max RSS 46824
2018-05-29 10:34:37.087554 max RSS 46852
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)]
2018-05-29 10:34:56.281262
Top 3 lines
#1: scratches/scratch.py:36: 6527.0 KiB
words = list(words)
#2: scratches/scratch.py:38: 16.4 KiB
prefix = word[:3]
#3: scratches/scratch.py:39: 10.1 KiB
counts[prefix] += 1
19 other: 10.8 KiB
Total allocated size: 6564.3 KiB
If you're on Linux, you may find /proc/self/statm more useful than the resource module.
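For instance, a minimal Linux-only sketch that reads the first two fields of /proc/self/statm (total program size and resident set size, both in pages):

import resource

def statm_kib():
    # /proc/self/statm: size resident shared text lib data dt (all in pages)
    with open('/proc/self/statm') as f:
        size_pages, resident_pages = f.read().split()[:2]
    page_kib = resource.getpagesize() // 1024
    return int(size_pages) * page_kib, int(resident_pages) * page_kib

vsize_kib, rss_kib = statm_kib()
print('virtual: %d KiB, resident: %d KiB' % (vsize_kib, rss_kib))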
This one has been answered already here: Python memory profiler
Basically you do something like this (cited from Guppy-PE):
>>> from guppy import hpy; h=hpy()
>>> h.heap()
Partition of a set of 48477 objects. Total size = 3265516 bytes.
Index Count % Size % Cumulative % Kind (class / dict of class)
0 25773 53 1612820 49 1612820 49 str
1 11699 24 483960 15 2096780 64 tuple
2 174 0 241584 7 2338364 72 dict of module
3 3478 7 222592 7 2560956 78 types.CodeType
4 3296 7 184576 6 2745532 84 function
5 401 1 175112 5 2920644 89 dict of class
6 108 0 81888 3 3002532 92 dict (no owner)
7 114 0 79632 2 3082164 94 dict of type
8 117 0 51336 2 3133500 96 type
9 667 1 24012 1 3157512 97 __builtin__.wrapper_descriptor
<76 more rows. Type e.g. '_.more' to view.>
>>> h.iso(1,[],{})
Partition of a set of 3 objects. Total size = 176 bytes.
Index Count % Size % Cumulative % Kind (class / dict of class)
0 1 33 136 77 136 77 dict (no owner)
1 1 33 28 16 164 93 list
2 1 33 12 7 176 100 int
>>> x=[]
>>> h.iso(x).sp
0: h.Root.i0_modules['__main__'].__dict__['x']
>>>
If you only want to look at the memory usage of a single object (this is the answer to a different question), there is a module called Pympler which contains the asizeof module.
Use it as follows:
from pympler import asizeof
asizeof.asizeof(my_object)
Unlike sys.getsizeof, it works for your self-created objects.
>>> asizeof.asizeof(tuple('bcd'))
200
>>> asizeof.asizeof({'foo': 'bar', 'baz': 'bar'})
400
>>> asizeof.asizeof({})
280
>>> asizeof.asizeof({'foo':'bar'})
360
>>> asizeof.asizeof('foo')
40
>>> asizeof.asizeof(Bar())
352
>>> asizeof.asizeof(Bar().__dict__)
280
>>> help(asizeof.asizeof)
Help on function asizeof in module pympler.asizeof:
asizeof(*objs, **opts)
Return the combined size in bytes of all objects passed as positional arguments.
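To see the difference against sys.getsizeof on a self-created object, a small hypothetical class is enough (a sketch only; Bar here is just an example stand-in for your own class):

import sys
from pympler import asizeof

class Bar:
    def __init__(self):
        self.values = list(range(100))

b = Bar()
print(sys.getsizeof(b))    # size of the instance object itself only
print(asizeof.asizeof(b))  # recursive size, including the list and its integers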
Disclosure: this is applicable on Linux only and reports the memory used by the current process as a whole, not by individual functions within it. But it is nice because of its simplicity:
import resource

def using(point=""):
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return '''%s: usertime=%s systime=%s mem=%s mb
           ''' % (point, usage[0], usage[1],
                  usage[2] / 1024.0)
Just insert using("Label") where you want to see what's going on. For example
print(using("before"))
wrk = ["wasting mem"] * 1000000
print(using("after"))
>>> before: usertime=2.117053 systime=1.703466 mem=53.97265625 mb
>>> after: usertime=2.12023 systime=1.70708 mem=60.8828125 mb
Below is a simple function decorator which lets you track how much memory the process consumed before the function call and after it, and what the difference is:
import time
import os
import psutil

def elapsed_since(start):
    return time.strftime("%H:%M:%S", time.gmtime(time.time() - start))

def get_process_memory():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss

def profile(func):
    def wrapper(*args, **kwargs):
        mem_before = get_process_memory()
        start = time.time()
        result = func(*args, **kwargs)
        elapsed_time = elapsed_since(start)
        mem_after = get_process_memory()
        print("{}: memory before: {:,}, after: {:,}, consumed: {:,}; exec time: {}".format(
            func.__name__,
            mem_before, mem_after, mem_after - mem_before,
            elapsed_time))
        return result
    return wrapper
Here is my blog which describes all the details. (archived link)
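Usage is then just a matter of decorating whatever you want to measure; a minimal sketch, assuming the profile decorator above is in scope and psutil is installed:

@profile
def build_squares(n):
    # allocate a list large enough to show up in RSS
    return [x * x for x in range(n)]

squares = build_squares(1000000)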
Since the accepted answer and also the next highest voted answer have, in my opinion, some problems, I'd like to offer one more answer that is based closely on Ihor B.'s answer, with some small but important modifications.
This solution allows you to run profiling either by wrapping a function call with the profile function and calling it, or by decorating your function/method with the @profile decorator.
The first technique is useful when you want to profile some third-party code without messing with its source, whereas the second technique is a bit "cleaner" and works better when you don't mind modifying the source of the function/method you want to profile.
I've also modified the output, so that you get RSS, VMS, and shared memory. I don't care much about the "before" and "after" values, but only the delta, so I removed those (if you're comparing to Ihor B.'s answer).
Profiling code
# profile.py
import time
import os
import psutil
import inspect

def elapsed_since(start):
    #return time.strftime("%H:%M:%S", time.gmtime(time.time() - start))
    elapsed = time.time() - start
    if elapsed < 1:
        return str(round(elapsed*1000, 2)) + "ms"
    if elapsed < 60:
        return str(round(elapsed, 2)) + "s"
    if elapsed < 3600:
        return str(round(elapsed/60, 2)) + "min"
    else:
        return str(round(elapsed / 3600, 2)) + "hrs"

def get_process_memory():
    process = psutil.Process(os.getpid())
    mi = process.memory_info()
    return mi.rss, mi.vms, mi.shared

def format_bytes(bytes):
    if abs(bytes) < 1000:
        return str(bytes) + "B"
    elif abs(bytes) < 1e6:
        return str(round(bytes/1e3, 2)) + "kB"
    elif abs(bytes) < 1e9:
        return str(round(bytes / 1e6, 2)) + "MB"
    else:
        return str(round(bytes / 1e9, 2)) + "GB"

def profile(func, *args, **kwargs):
    def wrapper(*args, **kwargs):
        rss_before, vms_before, shared_before = get_process_memory()
        start = time.time()
        result = func(*args, **kwargs)
        elapsed_time = elapsed_since(start)
        rss_after, vms_after, shared_after = get_process_memory()
        print("Profiling: {:>20}  RSS: {:>8} | VMS: {:>8} | SHR {"
              ":>8} | time: {:>8}"
              .format("<" + func.__name__ + ">",
                      format_bytes(rss_after - rss_before),
                      format_bytes(vms_after - vms_before),
                      format_bytes(shared_after - shared_before),
                      elapsed_time))
        return result

    if inspect.isfunction(func):
        return wrapper
    elif inspect.ismethod(func):
        return wrapper(*args, **kwargs)
Example usage, assuming the above code is saved as profile.py:
from profile import profile
from time import sleep
from sklearn import datasets  # Just an example of a 3rd-party function call

# Method 1
run_profiling = profile(datasets.load_digits)
data = run_profiling()

# Method 2
@profile
def my_function():
    # do some stuff
    a_list = []
    for i in range(1, 100000):
        a_list.append(i)
    return a_list

res = my_function()
This should result in output similar to the below:
Profiling: <load_digits> RSS: 5.07MB | VMS: 4.91MB | SHR 73.73kB | time: 89.99ms
Profiling: <my_function> RSS: 1.06MB | VMS: 1.35MB | SHR 0B | time: 8.43ms
A couple of important final notes:
Keep in mind, this method of profiling is only going to be approximate, since lots of other stuff might be happening on the machine. Due to garbage collection and other factors, the deltas might even be zero.
For some unknown reason, very short function calls (e.g. 1 or 2 ms) show up with zero memory usage. I suspect this is some limitation of the hardware/OS (tested on a basic laptop with Linux) on how often memory statistics are updated.
To keep the examples simple, I didn't use any function arguments, but they should work as one would expect, i.e. profile(my_function, arg) to profile my_function(arg).
A simple example that calculates the memory usage of a block of code or a function with memory_profiler, while also returning the result of the function:
import memory_profiler as mp

def fun(n):
    tmp = []
    for i in range(n):
        tmp.extend(list(range(i*i)))
    return "XXXXX"
Calculate the memory usage before running the code, then the maximum usage while the code runs:
start_mem = mp.memory_usage(max_usage=True)
res = mp.memory_usage(proc=(fun, [100]), max_usage=True, retval=True)
print('start mem', start_mem)
print('max mem', res[0][0])
print('used mem', res[0][0]-start_mem)
print('fun output', res[1])
Calculate usage at sampling points while the function runs:
res = mp.memory_usage((fun, [100]), interval=.001, retval=True)
print('min mem', min(res[0]))
print('max mem', max(res[0]))
print('used mem', max(res[0])-min(res[0]))
print('fun output', res[1])
Credits: @skeept
Maybe this helps:
pip install gprof2dot
sudo apt-get install graphviz
gprof2dot -f pstats profile_for_func1_001 | dot -Tpng -o profile.png
import cProfile

def profileit(name):
    """
    Usage:

        @profileit("profile_for_func1_001")
    """
    def inner(func):
        def wrapper(*args, **kwargs):
            prof = cProfile.Profile()
            retval = prof.runcall(func, *args, **kwargs)
            # Note use of name from outer scope
            prof.dump_stats(name)
            return retval
        return wrapper
    return inner

@profileit("profile_for_func1_001")
def func1(...)
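A complete usage example might look like the following sketch; func1 here is just a hypothetical stand-in, and the dump file name is whatever you pass to the decorator:

@profileit("profile_for_func1_001")
def func1(n):
    # some work worth profiling
    return sum(i * i for i in range(n))

func1(10 ** 6)
# then render the dump:
#   gprof2dot -f pstats profile_for_func1_001 | dot -Tpng -o profile.png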

Optimization of python multithreading script - huge memory consumption

I have a script (a Django management command) with over 800 lines of code.
It imports data from an external web service, manipulates it and writes it to a Postgres DB.
I use multithreading because fetching the data from the web service is not very fast.
There is one thread that fetches the data with a bulk command, getting 64 data sets at a time, and writes each data set into a queue.
At the same time, from the beginning, there is one worker thread that manipulates the data and writes it to the DB.
In the main (handle) class, there is a while loop that checks every 5 seconds how many elements are in the queue and how many worker threads are running.
If there are more than 500 elements in the queue and fewer than 5 worker threads, it starts a new worker thread.
All worker threads take one item from the queue, manipulate it, write the data set to the DB and append one string (up to 14 chars) to a second queue (#2).
Queue #2 is necessary so that, at the end of the import, all imported objects can be marked as new and all other items, which were not part of the current import, can be deleted from the DB.
For DBs with no more than 200,000 data sets everything works fine.
But if there is, for example, a DB with 1,000,000 data sets, the memory consumption increases over the course of the whole script up to 8 GB of RAM.
Is there a way to watch the memory consumption of threads and/or queues?
Is there a way to "clean up" memory after each iteration of the while loop?
# -*- coding: utf-8 -*-
import os
import threading
import Queue
import time
from optparse import OptionParser, make_option
from decimal import Decimal
from datetime import datetime
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.conf import settings
def is_someone_alive(thread_list):
so_alive = False
for t in thread_list:
if t.is_alive():
so_alive = True
return so_alive
class insert_item(threading.Thread):
VarLock2 = threading.Lock()
def __init__(self, queue1, item_still_exist2, name, *args, **options):
threading.Thread.__init__(self)
self.options = options
self.name = name
self.queue1 = queue1
self.item_still_exist2 = item_still_exist2
def run(self):
while not self.queue1.empty() or getItemBulkThread.isrunning:
item = self.queue1.get()
artikelobj, created = Artikel.objects.get_or_create(artikelnr=item['Nr'])
"""
manipulate data
"""
self.item_still_exist2.put(artikelobj.artikelnr)
artikelobj.save()
self.queue1.task_done()
class getItemBulkThread(threading.Thread):
isrunning = True
VarLock = threading.Lock()
def __init__(self, queue1, name, *args, **options):
threading.Thread.__init__(self)
self.options = options
if self.options['nrStart'] != '':
self.nrab = self.options['nrStart']
else:
self.nrab = ''
self.name = name
#self.nrab = '701307'
self.queue1 = queue1
self.anz_artikel = 64
self.max_artikel = 64
self.skipped = 0
self.max_skip = 20
def run(self):
count_sleep = 0
while True:
while self.queue1.qsize() > 5000:
time.sleep(5)
count_sleep += 1
if count_sleep > 0:
print "~ Artikel-Import %(csleep)sx für 5s pausiert, da Queue-Size > 5000" % {'csleep': count_sleep}
count_sleep = 0
try:
items = getItemBulk() # from external service
except Exception as exc1:
if ('"normal" abort-condition' in str(exc1)):
getItemBulkThread.VarLock.acquire()
getItemBulkThread.isrunning = False
getItemBulkThread.VarLock.release()
break
elif self.anz_artikel > 1:
self.anz_artikel /= 2
continue
elif self.skipped <= self.max_skip:
self.nrab += 1
self.skipped += 1
time.sleep(5)
continue
elif self.skipped > self.max_skip:
raise Exception("[EXCEPTION] Fehler im Thread: too much items skipped")
else:
getItemBulkThread.VarLock.acquire()
getItemBulkThread.isrunning = False
getItemBulkThread.VarLock.release()
raise
last_item = len(items) - 1
self.nrab = items[last_item]['Nr']
for artikel in items:
artikel['katItem'] = False
self.queue1.put(artikel)
if self.anz_artikel < self.max_artikel:
self.anz_artikel *= 2
self.skipped = 0
class Command(BaseCommand):
"""
Django-mgm-command
"""
help = u'Import'
def create_parser(self, prog_name, subcommand):
"""
Create and return the ``OptionParser`` which will be used to
parse the arguments to this command.
"""
return OptionParser(prog=prog_name, usage=self.usage(subcommand),
version=self.get_version(),
option_list=self.option_list,
conflict_handler="resolve")
def handle(self, *args, **options):
startzeit = datetime.now()
anzahl_Artikel_vorher = Artikel.objects.all().count() # Artikel is a model
self.options = options
items_vorher = []
queue1 = Queue.Queue()
item_still_exists2 = Queue.Queue()
running_threads = []
thread = getItemBulkThread(queue1, name="Artikel", *args, **options)
running_threads.append(thread)
thread.daemon = True
thread.start()
anz_worker_threads = 1
anz_max_worker_threads = 5
insert_threads = [insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': i + 1}, *args, **options) for i in range(anz_worker_threads)]
for thread in insert_threads:
running_threads.append(thread)
thread.setDaemon(True)
thread.start()
add_seconds = 5
element_grenze = 500
lastelemente = 0
asc_elemente = 0
anz_abgearbeitet = 0
while getItemBulkThread.isrunning or not queue1.empty():
time.sleep(add_seconds)
elemente = queue1.qsize()
akt_zeit = datetime.now()
diff_zeit = akt_zeit - startzeit
diff = elemente - lastelemente
anz_abgearbeitet = item_still_exists2.qsize()
art_speed = (anz_abgearbeitet / timedelta_total_seconds(diff_zeit)) * 60
ersetz_var = {'anz': elemente, 'zeit': diff_zeit, 'tstamp': akt_zeit.strftime('%Y.%m.%d-%H:%M:%S'), 'anzw': anz_worker_threads, 'diff': diff, 'anza': anz_abgearbeitet, 'art_speed': art_speed}
print("%(zeit)s vergangen - %(tstamp)s - %(anz)s Elemente in Queue, Veränderung: %(diff)s - Anz Worker: %(anzw)s - Artikel importiert: %(anza)s - Speed: %(art_speed)02d Art/Min" % ersetz_var)
if diff > 0:
asc_elemente += 1
else:
asc_elemente = 0
if asc_elemente > 2 and anz_worker_threads < anz_max_worker_threads and elemente > element_grenze:
ersetz_var = {'maxw': anz_max_worker_threads, 'nr': anz_worker_threads + 1, 'element_grenze': element_grenze}
print "~~ 2x in Folge mehr Queue-Elemente als vorher, die max. Anzahl an Workern %(maxw)s noch nicht erreicht und mehr als %(element_grenze)s Elemente in der Queue, daher Start eines neuen Workers (Nr %(nr)s)" % ersetz_var
anz_worker_threads += 1
thread = insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': anz_worker_threads}, *args, **options)
running_threads.append(thread)
thread.setDaemon(True)
thread.start()
asc_elemente = 0
lastelemente = elemente
queue1.join()
items_nachher = []
while not item_still_exists2.empty():
item = item_still_exists2.get()
if item in items_vorher:
items_nachher.append(item)
items_vorher.remove(item)
item_still_exists2.task_done()
item_still_exists2.join()
if len(items_vorher) > 0:
Artikel.objects.filter(artikelnr__in=items_vorher).delete()
anzahl_Artikel_nachher = Artikel.objects.all().count()
anzahl_Artikel_diff = anzahl_Artikel_nachher - anzahl_Artikel_vorher
endzeit = datetime.now()
dauer = endzeit - startzeit
I've abbreviated the code in some places :)
A possible cause for excessive memory consumption is that you don't set a maximum size for the input queue. See the maxsize parameter.
On a related note, you write:
In the main (handle) class, there is a while loop that checks every 5 seconds how many elements are in the queue and how many worker threads are running. If there are more than 500 elements in the queue and fewer than 5 worker threads, it starts a new worker thread.
Creating a new thread does not necessarily increase the throughput. You should rather do some tests to determine the optimal number of threads, which may turn out to be 1.
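As a rough sketch (not a drop-in change to the full script): giving the input queue a maximum size makes the fetching thread block on put() instead of letting the queue, and therefore memory, grow without bound. The 5000 below simply mirrors the back-off threshold already used in the question:

import Queue  # Python 2, as in the question

queue1 = Queue.Queue(maxsize=5000)  # put() blocks once 5000 items are waiting
item_still_exists2 = Queue.Queue()

# The producer then no longer needs its manual back-off loop
# ("while self.queue1.qsize() > 5000: time.sleep(5)"); a plain
# queue1.put(artikel) will wait until a worker has drained some items.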
