I'm doing packet injection with Scapy and creating a threading.Timer which deletes the packet information from the dictionary after 10 seconds.
If I receive a response within 10 seconds, I cancel the Timer and delete the packet information from the dictionary. Since I'm cancelling the Timer, I can call .join() there. But when the Timer expires on its own, there's no place to .join() it.
When I run the program, the memory keeps on increasing. Granted, the increase is slow (initially 2% on a 1 GB RAM system; in 20 minutes it went to 3.9%), but it keeps increasing nonetheless. I tried gc.collect(), but it doesn't help. I'm monitoring the memory with top.
Below is the code. Sorry for its length; I think it's better to give the whole thing in case I'm missing something somewhere.
from threading import Timer
from scapy.all import *
import gc
class ProcessPacket():
#To write the packets to the response.pcap file
def pcapWrite(self, pkts):
self.writerResponse.write(pkts)
self.writerResponse.flush()
return
#cancels the Timer upon receiving a response
def timerCancel(self, ipPort):
if self.pktInfo[ipPort]["timer"].isAlive():
self.pktInfo[ipPort]["timer"].cancel()
self.pktInfo[ipPort]["timer"].join()
self.delete(ipPort)
return
#deletes the packet information from pktInfo
def delete(self, ipPort):
if self.pktInfo.has_key(ipPort):
self.pcapWrite(self.pktInfo[ipPort]["packets"])
del self.pktInfo[ipPort]
return
#processes the received packet and sends the response
def createSend(self, pkt, ipPort):
self.pktInfo[ipPort]["packets"] = [pkt]
self.pktInfo[ipPort]["timer"] = Timer(10, self.delete, args=[ipPort])
myPkt = IP(src = pkt[IP].dst, dst = pkt[IP].src)/ TCP(dport = pkt[TCP].sport, sport = pkt[TCP].dport, flags = 'SA')
self.pktInfo[ipPort]["packets"].append(myPkt)
send(myPkt, verbose=0)
self.pktInfo[ipPort]["timer"].start()
return
#constructor
def __init__(self):
self.count = 0
self.writerResponse=PcapWriter('response.pcap',append = True)
self.pktInfo = {}
return
#from sniff
def pktCallback(self,pkt):
ipPort = pkt[IP].src + str(pkt[TCP].sport) + pkt[IP].dst + str(pkt[TCP].dport)
flag=pkt.sprintf('%TCP.flags%')
if self.count == 10:
self.count = 0
gc.collect()
if not self.pktInfo.has_key(ipPort) and flag == 'S':
self.pktInfo[ipPort] = {}
self.createSend(pkt, ipPort)
self.count += 1
elif self.pktInfo.has_key(ipPort):
self.timerCancel(ipPort)
self.count += 1
return
#custom filter for sniff
def myFilter(pkt):
if pkt.haslayer(IP):
if pkt[IP].src == "172.16.0.1":
return 1
return 0
if __name__ == '__main__':
respondObj = ProcessPacket()
sniff(iface = 'eth0', filter = "tcp", prn = respondObj.pktCallback, lfilter = myFilter)
In the program, I don't see any memory-consuming factor other than pktInfo and the Timers. pktInfo grows and shrinks, so the problem must be with the Timers. How can I free the memory of expired or cancelled Timers?
EDIT 1:
I modified the delete() function:
#deletes the packet information from pktInfo
def delete(self, ipPort):
if self.pktInfo.has_key(ipPort):
print "Before", len(self.pktinfo.keys()), sys.getsizeof(self.pktinfo)
self.pcapWrite(self.pktInfo[ipPort]["packets"])
self.pktInfo[ipPort]["timer"]=None
self.pktInfo[ipPort]=None
del self.pktInfo[ipPort]
gc.collect()
print "After", len(self.pktinfo.keys()), sys.getsizeof(self.pktinfo)
return
After deletion, the number of elements in self.pktInfo decreases. The size of self.pktInfo remains the same for a long time but eventually changes (decreases or increases). But the system memory doesn't seem to be released; top shows the same behaviour, with the memory used by the program continuously increasing.
Release all references to unneeded Timer objects (e.g. self.pktInfo[ipPort]["timer"] = None); if you don't, the garbage collector won't be able to free them.
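A minimal sketch of what that looks like in the cancel path, mirroring the structure of the code above (illustrative only, not a verified fix; the rest of the class stays as-is):
#cancels the Timer upon receiving a response, and drops the reference to it
def timerCancel(self, ipPort):
    entry = self.pktInfo.get(ipPort)
    if entry is None:
        return
    timer = entry.get("timer")
    if timer is not None and timer.is_alive():
        timer.cancel()
    entry["timer"] = None  # release the Timer object explicitly
    self.delete(ipPort)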
Related
I have a list of ~300K URLs for an API I need to get data from.
The API limit is 100 calls per second.
I have made a class for the asynchronous calls, but it is working too fast and I am hitting an error from the API.
How do I slow down the asynchronous calls so that I make at most 100 per second?
import grequests
lst = ['url.com','url2.com']
class Test:
def __init__(self):
self.urls = lst
def exception(self, request, exception):
print ("Problem: {}: {}".format(request.url, exception))
def async(self):
return grequests.map((grequests.get(u) for u in self.urls), exception_handler=self.exception, size=5)
def collate_responses(self, results):
return [x.text for x in results]
test = Test()
#here we collect the results returned by the async function
results = test.async()
response_text = test.collate_responses(results)
The first step that I took was to create an object that can distribute a maximum of n coins every t ms.
import time
class CoinsDistribution:
"""Object that distribute a maximum of maxCoins every timeLimit ms"""
def __init__(self, maxCoins, timeLimit):
self.maxCoins = maxCoins
self.timeLimit = timeLimit
self.coin = maxCoins
self.time = time.perf_counter()
def getCoin(self):
if self.coin <= 0 and not self.restock():
return False
self.coin -= 1
return True
def restock(self):
t = time.perf_counter()
if (t - self.time) * 1000 < self.timeLimit:
return False
self.coin = self.maxCoins
self.time = t
return True
Now we need a way of forcing functions to only get called if they can get a coin.
To do that, we can write a decorator that we would use like this:
@limitCalls(callLimit=1, timeLimit=1000)
def uniqFunctionRequestingServer1():
return 'response from s1'
But sometimes multiple functions are requesting the same server, so we would want them to get coins from the same CoinsDistribution object.
Therefore, another way to use the decorator is by supplying the CoinsDistribution object:
server_2_limit = CoinsDistribution(3, 1000)
@limitCalls(server_2_limit)
def sendRequestToServer2():
return 'it worked !!'
@limitCalls(server_2_limit)
def sendAnOtherRequestToServer2():
return 'it worked too !!'
We now have to create the decorator; it can take either a CoinsDistribution object or enough data to create a new one.
import functools
def limitCalls(obj=None, *, callLimit=100, timeLimit=1000):
if obj is None:
obj = CoinsDistribution(callLimit, timeLimit)
def limit_decorator(func):
@functools.wraps(func)
def limit_wrapper(*args, **kwargs):
if obj.getCoin():
return func(*args, **kwargs)
return 'limit reached, please wait'
return limit_wrapper
return limit_decorator
And it's done! Now you can limit the number of calls to any API that you use, and you can build a dictionary to keep track of your CoinsDistribution objects if you have to manage a lot of them (for different API endpoints or different APIs).
Note: here I have chosen to return an error message if there are no coins available. You should adapt this behaviour to your needs.
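For example, that bookkeeping dictionary could be as simple as the sketch below (the endpoint names and limits are made up for illustration):
# Hypothetical registry of per-endpoint rate limiters.
limiters = {
    'server1': CoinsDistribution(100, 1000),  # 100 calls per second
    'server2': CoinsDistribution(3, 1000),    # 3 calls per second
}

@limitCalls(limiters['server1'])
def queryServer1():
    return 'response from s1'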
You can just keep track of how much time has passed and decide if you want to do more requests or not.
This will print 100 numbers per second, for example:
from datetime import datetime
import time
start = datetime.now()
time.sleep(1)
counter = 0
while (True):
end = datetime.now()
s = (end-start).seconds
if (counter >= 100):
if (s <= 1):
time.sleep(1) # You can keep track of the time and sleep less, actually
start = datetime.now()
counter = 0
print(counter)
counter += 1
This other question on SO shows exactly how to do this. By the way, what you need is usually called throttling.
I am testing Ant Colony Optimisation (ACO) software which runs with multiple threads (one for each ant created).
Each ACO iteration should wait for all threads to finish before allowing the next iteration to start. I am doing this with Condition() from the threading module.
Since ants share a pheromone matrix, reading and writing that matrix is protected by locks, also from the threading module.
Now a description of the problem:
I run the function and print something at each iteration. Sometimes, not always, it seems that the execution of the function stops, that is, it stops printing, meaning the iteration never finished.
I honestly don't know why this is happening, and I would appreciate any answer that could get me on the right track. If I had to guess, I would say the condition variable is not being used properly, or something like that. However, I am not sure, and I also find it odd that this only happens sometimes.
Below are the relevant functions.
The ACO starts by calling the start() function. This creates N threads, which, when finished, call update(). This update function, upon being called N times, calls notify, which allows start() to continue the process and, finally, start the next iteration. I also posted the run method of each thread.
It may be worth mentioning that, without daemon actions, the error hardly occurs. With daemon actions, it occurs almost always (which I also find odd).
Finally, the error does not happen always in the same iteration.
def start(self):
self.ants = self.create_ants()
self.iter_counter = 0
while self.iter_counter < self.num_iterations:
print "START ACQUIRED"
self.cv.acquire()
print "calling iteration"
self.iteration()
#CV wait until all ants (threads) finish and call update, which
#calls notify(), and allow continuation
while not self.iter_done:
print "iter not complete, W8ING"
self.cv.wait()
print "global update "
self.global_update_with_lock()
print "START RELEASED"
self.cv.release()
def update(self, ant):
lock = Lock()
lock.acquire()
print "Update called by %s" % (ant.ID,)
self.ant_counter += 1
self.avg_path_cost += ant.path_cost
# book-keeping
if ant.path_cost < self.best_path_cost:
self.best_path_cost = ant.path_cost
self.best_path_mat = ant.path_mat
self.best_path_vec = ant.path_vec
self.last_best_path_iteration = self.iter_counter
#all threads finished, call notify
print "ant counter"
print self.ant_counter
if self.ant_counter == len(self.ants):
print "ants finished"
#THIS MIGHT CAUSE PROBLEMS (no need to notify if no one is waiting)
self.best_cost_at_iter.append(self.best_path_cost)
self.avg_path_cost /= len(self.ants)
self.cv.acquire()
self.iter_done = True
self.cv.notify()
self.cv.release()
lock.release()
# override Thread's run()
def run(self):
graph = self.colony.graph
while not self.end():
# we need exclusive access to the graph
graph.lock.acquire()
new_node = self.state_transition_rule(self.curr_node)
self.path_cost += graph.delta(self.curr_node, new_node)
self.path_vec.append(new_node)
self.path_mat[self.curr_node][new_node] = 1 #adjacency matrix representing path
#print "Ant %s : %s, %s" % (self.ID, self.path_vec, self.path_cost,)
self.local_updating_rule(self.curr_node, new_node)
graph.lock.release()
self.curr_node = new_node
# close the tour
self.path_vec.append(self.path_vec[0])
#RUN LOCAL HEURISTIC
if self.daemon == True:
try:
daemon_result = twoOpt(self.path_vec, graph.delta_mat)
d_path, d_adj = daemon_result['path_vec'], daemon_result['path_matrix']
self.path_vec = d_path
self.path_mat = d_adj
except Exception, e:
print "exception: " + str(e)
traceback.print_exc()
self.path_cost += graph.delta(self.path_vec[-2], self.path_vec[-1])
# send our results to the colony
self.colony.update(self)
#print "Ant thread %s terminating." % (self.ID,)
# allows thread to be restarted (calls Thread.__init__)
self.__init__(self.ID, self.start_node, self.colony, self.daemon, self.Beta, self.Q0, self.Rho)
Solution to the problem:
First of all, I corrected the error in the waiting on the condition variable, according to the comments here.
Second, it was still hanging sometimes, and this was due to a bug in the thread counter update.
The solution was to change the counter from an int to an array of length num_threads, full of 0's, where each thread updates its own position in the list. When all threads have finished, the counter array is all 1's. This is currently working just fine.
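A rough sketch of that idea, assuming ant.ID is the ant's index from 0 to num_threads - 1 (attribute names here are illustrative, not the exact ones from the project):
# In start(), before launching each iteration:
#     self.finished = [0] * len(self.ants)

def update(self, ant):
    with self.cv:                     # take the colony's condition/lock
        self.finished[ant.ID] = 1     # each thread marks only its own slot
        # ... book-keeping as before ...
        if all(self.finished):        # every ant has reported in
            self.iter_done = True
            self.cv.notify()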
I've seen several posts about this, so I know it is fairly straightforward to do, but I seem to be coming up short. I'm not sure if I need to create a worker pool, or use the Queue class. Basically, I want to be able to create several processes that each act autonomously (which is why they inherit from the Agent superclass).
At random ticks of my main loop I want to update each Agent. I'm using time.sleep with different values in the main loop and the Agent's run loop to simulate different processor speeds.
Here is my Agent superclass:
# Generic class to handle mpc of each agent
class Agent(mpc.Process):
# initialize agent parameters
def __init__(self,):
# init mpc
mpc.Process.__init__(self)
self.exit = mpc.Event()
# an agent's main loop...generally should be overridden
def run(self):
while not self.exit.is_set():
pass
print "You exited!"
# safely shutdown an agent
def shutdown(self):
print "Shutdown initiated"
self.exit.set()
# safely communicate values to this agent
def communicate(self,value):
print value
A specific agent's subclass (simulating an HVAC system):
class HVAC(Agent):
def __init__(self, dt=70, dh=50.0):
super(HVAC, self).__init__()
self.exit = mpc.Event()
self.__pref_heating = True
self.__pref_cooling = True
self.__desired_temperature = dt
self.__desired_humidity = dh
self.__meas_temperature = 0
self.__meas_humidity = 0.0
self.__hvac_status = "" # heating, cooling, off
self.start()
def run(self): # handle AC or heater on
while not self.exit.is_set():
ctemp = self.measureTemp()
chum = self.measureHumidity()
if (ctemp < self.__desired_temperature):
self.__hvac_status = 'heating'
self.__meas_temperature += 1
elif (ctemp > self.__desired_temperature):
self.__hvac_status = 'cooling'
self.__meas_temperature += 1
else:
self.__hvac_status = 'off'
print self.__hvac_status, self.__meas_temperature
time.sleep(0.5)
print "HVAC EXITED"
def measureTemp(self):
return self.__meas_temperature
def measureHumidity(self):
return self.__meas_humidity
def communicate(self,updates):
self.__meas_temperature = updates['temp']
self.__meas_humidity = updates['humidity']
print "Measured [%d] [%f]" % (self.__meas_temperature,self.__meas_humidity)
And my main loop:
if __name__ == "__main__":
print "Initializing subsystems"
agents = {}
agents['HVAC'] = HVAC()
# Run simulation
timestep = 0
while timestep < args.timesteps:
print "Timestep %d" % timestep
if timestep % 10 == 0:
curr_temp = random.randrange(68,72)
curr_humidity = random.uniform(40.0,60.0)
agents['HVAC'].communicate({'temp':curr_temp, 'humidity':curr_humidity})
time.sleep(1)
timestep += 1
agents['HVAC'].shutdown()
print "HVAC process state: %d" % agents['HVAC'].is_alive()
So the issue is that, whenever I run agents['HVAC'].communicate(x) within the main loop, I can see the value being passed into the HVAC subclass in its run loop (so it prints the received value correctly). However, the value is never successfully stored.
So typical output looks like this:
Initializing subsystems
Timestep 0
Measured [68] [56.948675]
heating 1
heating 2
Timestep 1
heating 3
heating 4
Timestep 2
heating 5
heating 6
In reality, as soon as Measured [68] appears, the internally stored value should be updated and the output should show 68 (not heating 1, heating 2, etc.). So effectively, the HVAC's self.__meas_temperature is not being properly updated.
Edit: After a bit of research, I realized that I didn't fully understand what is happening behind the scenes. Each subprocess operates in its own chunk of virtual memory and is completely isolated from data shared this way, so passing the value in isn't going to work. My new issue is that I'm not sure how to share a global value with multiple processes.
I was looking at the Queue and JoinableQueue classes, but I'm not sure how to pass a Queue into the kind of superclass setup that I have (especially with the mpc.Process.__init__(self) call).
A side concern is whether I can have multiple agents reading values from the queue without removing them from it. For instance, if I wanted to share a temperature value with multiple agents, would a Queue work for this?
Here's a suggested solution assuming that you want the following:
a centralized manager / main process which controls lifetimes of the workers
worker processes to do something self-contained and then report results to the manager and other processes
Before I show it, though, for the record I want to say that in general, unless you are CPU bound, multiprocessing is not really the right fit, mainly because of the added complexity, and you'd probably be better off using a different high-level asynchronous framework. Also, you should use Python 3; it's so much better!
That said, multiprocessing.Manager makes this pretty easy to do. I've done this in Python 3; I don't think there's anything here that shouldn't "just work" in Python 2, but I haven't checked.
from ctypes import c_bool
from multiprocessing import Manager, Process, Array, Value
from pprint import pprint
from time import sleep, time
class Agent(Process):
def __init__(self, name, shared_dictionary, delay=0.5):
"""My take on your Agent.
Key difference is that I've commonized the run-loop and used
a shared value to signal when to stop, to demonstrate it.
"""
super(Agent, self).__init__()
self.name = name
# This is going to be how we communicate between processes.
self.shared_dictionary = shared_dictionary
# Create a silo for us to use.
shared_dictionary[name] = []
self.should_stop = Value(c_bool, False)
# Primarily for testing purposes, and for simulating
# slower agents.
self.delay = delay
def get_next_results(self):
# In the real world I'd use abc.ABCMeta as the metaclass to do
# this properly.
raise RuntimeError('Subclasses must implement this')
def run(self):
ii = 0
while not self.should_stop.value:
ii += 1
# debugging / monitoring
print('%s %s run loop execution %d' % (
type(self).__name__, self.name, ii))
next_results = self.get_next_results()
# Add the results, along with a timestamp.
self.shared_dictionary[self.name] += [(time(), next_results)]
sleep(self.delay)
def stop(self):
self.should_stop.value = True
print('%s %s stopped' % (type(self).__name__, self.name))
class HVACAgent(Agent):
def get_next_results(self):
# This is where you do your work, but for the sake of
# the example just return a constant dictionary.
return {'temperature': 5, 'pressure': 7, 'humidity': 9}
class DumbReadingAgent(Agent):
"""A dumb agent to demonstrate workers reading other worker values."""
def get_next_results(self):
# get hvac 1 results:
hvac1_results = self.shared_dictionary.get('hvac 1')
if hvac1_results is None:
return None
return hvac1_results[-1][1]['temperature']
# Script starts.
results = {}
# The "with" ensures we terminate the manager at the end.
with Manager() as manager:
# the manager is a subprocess in its own right. We can ask
# it to manage a dictionary (or other python types) for us
# to be shared among the other children.
shared_info = manager.dict()
hvac_agent1 = HVACAgent('hvac 1', shared_info)
hvac_agent2 = HVACAgent('hvac 2', shared_info, delay=0.1)
dumb_agent = DumbReadingAgent('dumb hvac1 reader', shared_info)
agents = (hvac_agent1, hvac_agent2, dumb_agent)
list(map(lambda a: a.start(), agents))
sleep(1)
list(map(lambda a: a.stop(), agents))
list(map(lambda a: a.join(), agents))
# Not quite sure what happens to the shared dictionary after
# the manager dies, so for safety make a local copy.
results = dict(shared_info)
pprint(results)
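On the narrower follow-up in the question (sharing one temperature reading with several agents without consuming it from a queue): a Queue hands each item to exactly one reader, so a shared value is usually a better fit. A minimal, self-contained sketch with multiprocessing.Value (the names are illustrative):
from multiprocessing import Process, Value

def reader(temp, label):
    # Every reader sees the same shared value; reading does not consume it.
    print('%s sees temperature %d' % (label, temp.value))

if __name__ == '__main__':
    temperature = Value('i', 68)   # 'i' = shared signed int
    temperature.value = 72         # writes by the parent are visible to the children
    readers = [Process(target=reader, args=(temperature, 'agent %d' % i))
               for i in range(3)]
    for p in readers:
        p.start()
    for p in readers:
        p.join()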
I have this class called DecayingSet, which is a deque with expiration:
from time import time

class DecayingSet:
def __init__(self, timeout): # timeout in seconds
from collections import deque
self.timeout = timeout
self.d = deque()
self.present = set()
def add(self, thing):
# Return True if `thing` not already in set,
# else return False.
result = thing not in self.present
if result:
self.present.add(thing)
self.d.append((time(), thing))
self.clean()
return result
def clean(self):
# forget stuff added >= `timeout` seconds ago
now = time()
d = self.d
while d and now - d[0][0] >= self.timeout:
_, thing = d.popleft()
self.present.remove(thing)
I'm trying to use it inside a running script that connects to a streaming API.
The streaming API returns URLs that I am trying to put inside the deque, to keep them from entering the next step of the program.
class CustomStreamListener(tweepy.StreamListener):
def on_status(self, status, include_entities=True):
longUrl = status.entities['urls'][0]['expanded_url']
limit = DecayingSet(86400)
l = limit.add(longUrl)
print l
if l == False:
pass
else:
r = requests.get("http://api.some.url/show?url=%s"% longUrl)
When I use this class in an interpreter, everything is good.
But when the script is running and I repeatedly send in the same URL, l returns True every time, indicating that the URL is not inside the set, when it is supposed to be. What gives?
Copying my comment ;-) I think the indentation is screwed up, but it looks like you're creating a brand new limit object every time on_status() is called. Then of course it would always return True: you'd always be starting with an empty limit.
Regardless, change this:
l = limit.add(longUrl)
print l
if l == False:
pass
else:
r = requests.get("http://api.some.url/show?url=%s"% longUrl)
to this:
if limit.add(longUrl):
r = requests.get("http://api.some.url/show?url=%s"% longUrl)
Much easier to follow. It's usually the case that when you're comparing something to a literal True or False, the code can be made more readable.
Edit
I just saw in the interpreter that the var assignment is the culprit.
How would I use the same obj?
You could, for example, create the limit object at the module level. Cut and paste ;-)
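A minimal sketch of that arrangement, reusing the snippet from the question (imports added for completeness; the API URL is the question's placeholder):
import requests
import tweepy

# Create the set once, at module level, so every call to on_status()
# reuses the same DecayingSet instead of a fresh, empty one.
limit = DecayingSet(86400)

class CustomStreamListener(tweepy.StreamListener):
    def on_status(self, status, include_entities=True):
        longUrl = status.entities['urls'][0]['expanded_url']
        if limit.add(longUrl):
            requests.get("http://api.some.url/show?url=%s" % longUrl)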
I would like to create a data structure which represents a set of queues (ideally a hash, map, or dict-like lookup) where messages in the queues are actively removed after they've reached a certain age. The TTL value would be global; messages would not need nor have individual TTLs. The resolution for the TTL doesn't need to be terribly accurate - only within a second or so.
I'm not even sure what to search for here. I could create a separate global queue that a background thread monitors, peeking at and pulling pointers to messages off the global queue that tell it to remove items from the individual queues, but the behavior needs to go both ways: if an item gets removed from an individual queue, it needs to be removed from the global queue.
I would like for this data structure to be implemented in Python, ideally, and as always, speed is of the utmost importance (more so than memory usage). Any suggestions for where to start?
I'd start by just modeling the behavior you're looking for in a single class, expressed as simply as possible. Performance can come later on through iterative optimization, but only if necessary (you may not need it).
The class below does something roughly like what you're describing. Queues are simply lists that are named and stored in a dictionary. Each message is timestamped and inserted at the front of the list (FIFO). Messages are reaped by checking the timestamp of the message at the end of the list, and popping it until it hits a message that is below the age threshold.
If you plan to access this from several threads you'll need to add some fine-grained locking to squeeze the most performance out of it. For example, the reap() method should only lock one queue at a time, rather than locking all queues (method-level synchronization), so you'd also need to keep a lock for each named queue (a rough sketch of this appears after the usage example below).
Updated -- Now uses a global set of buckets (by timestamp, 1 second resolution) to keep track of which queues have messages from that time. This reduces the number of queues to be checked on each pass.
import time
from collections import defaultdict
class QueueMap(object):
def __init__(self):
self._expire = defaultdict(lambda *n: defaultdict(int))
self._store = defaultdict(list)
self._oldest_key = int(time.time())
def get_queue(self, name):
return self._store.get(name, [])
def pop(self, name):
queue = self.get_queue(name)
if queue:
key, msg = queue.pop()
self._expire[key][name] -= 1
return msg
return None
def set(self, name, message):
key = int(time.time())
# increment count of messages in this bucket/queue
self._expire[key][name] += 1
self._store[name].insert(0, (key, message))
def reap(self, age):
now = time.time()
threshold = int(now - age)
oldest = self._oldest_key
# iterate over buckets we need to check
for key in range(oldest, threshold + 1):
# for each queue with items, expire the oldest ones
for name, count in self._expire[key].iteritems():
if count <= 0:
continue
queue = self.get_queue(name)
while queue:
if queue[-1][0] > threshold:
break
queue.pop()
del self._expire[key]
# set oldest_key for next pass
self._oldest_key = threshold
Usage:
qm = QueueMap()
qm.set('one', 'message 1')
qm.set('one', 'message 2')
qm.set('two', 'message 3')
print qm.pop('one')
print qm.get_queue('one')
print qm.get_queue('two')
# call this on a background thread which sleeps
time.sleep(2)
# reap messages older than 1 second
qm.reap(1)
# queues should be empty now
print qm.get_queue('one')
print qm.get_queue('two')
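Following up on the locking note above, a rough sketch of how per-queue locks might be layered on top of QueueMap (illustrative only; reap() would grab the same per-name lock around each queue it drains):
import threading
from collections import defaultdict

class LockingQueueMap(QueueMap):
    def __init__(self):
        QueueMap.__init__(self)
        # One lock per named queue, created lazily on first use.
        self._locks = defaultdict(threading.Lock)

    def set(self, name, message):
        with self._locks[name]:
            QueueMap.set(self, name, message)

    def pop(self, name):
        with self._locks[name]:
            return QueueMap.pop(self, name)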
Consider checking the TTLs whenever you access the queues, instead of using a thread to check constantly. I'm not sure what you mean about the hash/map/dict (what is the key?), but how about something like this:
import time
class EmptyException(Exception): pass
class TTLQueue(object):
TTL = 60 # seconds
def __init__(self):
self._queue = []
def push(self, msg):
self._queue.append((time.time()+self.TTL, msg))
def pop(self):
self._queue = [(t, msg) for (t, msg) in self._queue if t > time.time()]
if len(self._queue) == 0:
raise EmptyException()
return self._queue.pop(0)[1]
queues = [TTLQueue(), TTLQueue(), TTLQueue()] # this could be a dict or set or
# whatever if I knew what keys
# you expected