Python multiprocessing pool memory issue

I want to run a Python process that stays alive in the background, monitoring whether some XMLs have been updated. However, I am getting python [defunct] after a while. I suspect it is a memory-related issue, because I can see the memory of the child process keep growing, but I am not sure how to fix it. Here's the code:
def getXmlFile(url):
    req = urllib2.Request(url)
    rep = urllib2.urlopen(req)
    response_content = rep.read()
    last_modified = rep.info().getheader('Last-Modified')
    return (response_content, last_modified)

def writeXmlFile(content, fileName):
    file_path = os.path.join(OUTPUT, fileName)
    f = open(file_path, 'w')
    f.write(content)
    f.close()

def downloadXml(xml):
    url = BASE_URL + xml
    (response, last_modified) = getXmlFile(url)
    if LAST_MODIFIED.get(xml) is None or LAST_MODIFIED[xml] != last_modified:
        LAST_MODIFIED[xml] = last_modified
        utc_datetime = datetime.datetime.utcnow()
        currentTime = utc_datetime.strftime('%Y%m%d%H%M%S')
        fileName = xml.replace('.XML', '_%s.XML' % currentTime)
        print "Writing " + fileName
        writeXmlFile(response, fileName)

def main():
    pool = Pool(processes=1)
    while True:
        pool.map_async(downloadXml, XmlList)
        time.sleep(2)
    pool.close()
    pool.join()

if __name__ == "__main__":
    main()
Everything works fine initially: the script checks certain XMLs every 2 seconds and downloads any that have been updated. But it hangs after a while, and ps aux shows the child Python process as python [defunct].
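One common source of unbounded growth in this pattern is that map_async() is fired every two seconds without its results ever being collected, and the pool.close()/pool.join() calls sit after an infinite loop, so they are never reached. As a hedged sketch of the intended polling loop (not necessarily the fix the asker ended up with), using the blocking map() keeps batches from piling up:

def main():
    pool = Pool(processes=1)
    try:
        while True:
            # map() blocks until the whole batch finishes, so work is never
            # queued faster than the single worker can drain it
            pool.map(downloadXml, XmlList)
            time.sleep(2)
    finally:
        # only reached if the loop is interrupted (e.g. KeyboardInterrupt)
        pool.close()
        pool.join()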

Related

write data to JSON file during multiprocessing using python

I am new to Python. I am writing a program that writes to a JSON file when a website is unreachable. The websites are stored in the hosts variable, and the check is scheduled to run every 5 seconds. I used Pool from multiprocessing so the websites are checked at the same time, without delay, and then the data is written to the JSON file. But it is writing only one website's data to the JSON file. How can I make it write the data for both at the same time?
Here's the sample code:
import os
from multiprocessing import Pool
from datetime import datetime
import time
import json

hosts = ["www.google.com", "www.smackcoders.com"]
n = len(hosts)

def write(hosts):
    u = "down"
    name = "stack.json"
    if not os.path.exists(name):
        with open(name, 'w') as f:
            f.write('{}')
    result = [(timestamp, {'monitor.status': u,
                           "monitor.id": "tcp-tcp#" + hosts
                           })]
    with open(name, 'rb+') as f:
        f.seek(-1, os.SEEK_END)
        f.truncate()
        for entry in result:
            _entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
            _entry = _entry.encode()
            f.write(_entry)
        f.write('}'.encode('ascii'))

def main(hosts):
    p = Pool(processes=n)
    result = p.map(write, hosts)

while True:
    timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
    main(hosts)
    time.sleep(5)
My output:
""March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},
}
Required Output:
{"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.google.com", "monitor.status": "down"},
}
I've made some minor changes to your code and implemented a Lock.
import os
from multiprocessing import Pool, RLock
from datetime import datetime
import time
import json

file_lock = RLock()
hosts = ["www.google.com", "www.smackcoders.com"]
n = len(hosts)

def write(hosts):
    u = "down"
    name = "stack.json"
    if not os.path.exists(name):
        with open(name, 'w') as f:
            f.write('{}')
    result = [(timestamp, {'monitor.status': u,
                           "monitor.id": "tcp-tcp#" + hosts
                           })]
    with file_lock:
        with open(name, 'rb+') as f:
            f.seek(-1, os.SEEK_END)
            f.truncate()
            for entry in result:
                _entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
                _entry = _entry.encode()
                f.write(_entry)
            f.write('}'.encode('ascii'))

def main(hosts):
    p = Pool(processes=n)
    result = p.map(write, hosts)

while True:
    timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
    main(hosts)
    time.sleep(5)
However, a long-running process that constantly has to read and rewrite a file for logging is a poor fit: on every check the code reads an ever-growing file and writes the whole thing back out. Consider writing the log to a database instead.
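As a rough illustration of that suggestion (not part of the original answer), here is a minimal sketch using the standard-library sqlite3 module; the database file name, table, and column names are my own assumptions:

import sqlite3

def log_status(timestamp, host, status, db_path="monitor.db"):
    # One small INSERT per check instead of rereading and rewriting a growing JSON file
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("CREATE TABLE IF NOT EXISTS monitor "
                     "(ts TEXT, host TEXT, status TEXT)")
        conn.execute("INSERT INTO monitor (ts, host, status) VALUES (?, ?, ?)",
                     (timestamp, "tcp-tcp#" + host, status))
        conn.commit()
    finally:
        conn.close()

Each worker can open its own connection, and SQLite's own locking serializes the writes, so no explicit lock is needed for this simple case.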
Here's a different option that uses Thread instead of Pool.
I created a class that overrides Thread so the return value can be collected from join():
# Class that overrides Thread to get the return value from join()
class ThreadWithReturnValue(Thread):
    def __init__(self, group=None, target=None, name=None, args=None, kwargs=None, Verbose=None):
        if args is None:
            args = ()
        if kwargs is None:
            kwargs = {}
        super().__init__(group, target, name, args, kwargs)
        self._return = None

    def run(self):
        print(type(self._target))
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, *args):
        Thread.join(self, *args)
        return self._return
I changed the code to get the status of each host first, then write the results to your file. I also fixed the way the JSON file is written.
import os
from datetime import datetime
import time
import json
from threading import Thread

hosts = ["www.google.com", "www.smackcoders.com"]
filepath = os.path.join(os.getcwd(), "stack.json")
n = len(hosts)

def perform_ping(host_ip):
    """
    You had hardcoded "down"; this method pings the host to check whether we get an ICMP response
    """
    response = os.system("ping -c 1 " + host_ip)
    if response == 0:
        return 'UP'
    else:
        return 'DOWN'

def write_result(timestamp, results):
    # u = "down"  -- using perform_ping to get the status instead
    if not os.path.exists(filepath):
        current_file = {}
    else:
        # If the file exists, read the current output
        with open(filepath, 'r') as f_read:
            current_file = json.loads(f_read.read())
    inner_result = []
    for result in results:
        host, status = result
        inner_result.append({'monitor.status': status,
                             "monitor.id": "tcp-tcp#" + host
                             })
    current_file[timestamp] = inner_result
    # write the file with the new input
    with open(filepath, 'w') as f_write:
        f_write.write(json.dumps(current_file))

def main():
    while True:
        thread_list = []
        for host_ip in hosts:
            thread_list.append(ThreadWithReturnValue(target=perform_ping, name=host_ip, args=(host_ip, )))
        results = []
        timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
        for thread in thread_list:
            thread.start()
        for thread in thread_list:
            results.append((thread.name, thread.join()))
        # Pings run in parallel; write the result at the end to avoid thread
        # collisions and to avoid reading/writing the file too many times if
        # you increase the number of hosts
        write_result(timestamp, results)
        time.sleep(5)

if __name__ == '__main__':
    main()

How to spin up a new thread when data is available in a queue to process

I have a function that streams a zip file into a byte buffer; from that buffer I create chunks of 5000 lines each. Now I am trying to write these chunks back to an S3 bucket as separate files. Since I am using AWS Lambda, I cannot let a single thread handle the whole workflow, as there is a 5-minute constraint after which AWS Lambda times out. I come from a Java background, where threads are pretty simple to implement, but in Python I am getting confused about how to run a pool of threads to take care of the upload-to-S3 part of my process. Here is my code:
import io
import zipfile
import boto3
import sys
import multiprocessing
# from multiprocessing.dummy import Pool as ThreadPool
import time

s3_client = boto3.client('s3')
s3 = boto3.resource('s3', 'us-east-1')

def stream_zip_file():
    # pool = ThreadPool(threads)
    start_time_main = time.time()
    start_time_stream = time.time()
    obj = s3.Object(
        bucket_name='monkey-business-dev-data',
        key='sample-files/daily/banana/large/banana.zip'
    )
    end_time_stream = time.time()
    # process_queue = multiprocessing.Queue()
    buffer = io.BytesIO(obj.get()["Body"].read())
    output = io.BytesIO()
    print(buffer)
    z = zipfile.ZipFile(buffer)
    foo2 = z.open(z.infolist()[0])
    print(sys.getsizeof(foo2))
    line_counter = 0
    file_clounter = 0
    for line in foo2:
        line_counter += 1
        output.write(line)
        if line_counter >= 5000:
            file_clounter += 1
            line_counter = 0
            # pool.map(upload_to_s3, (output, file_clounter))
            # upload_to_s3(output, file_clounter)
            # process_queue.put(output)
            output.close()
            output = io.BytesIO()
    if line_counter > 0:
        # process_queue.put(output)
        # upload_to_s3(output, file_clounter)
        # pool.map(upload_to_s3, args =(output, file_clounter))
        output.close()
    print('Total Files: {}'.format(file_clounter))
    print('Total Lines: {}'.format(line_counter))
    output.seek(0)
    start_time_upload = time.time()
    end_time_upload = time.time()
    output.close()
    z.close()
    end_time_main = time.time()
    print('''
    main: {}
    stream: {}
    upload: {}
    '''.format((end_time_main - start_time_main), (end_time_stream - start_time_stream), (end_time_upload - start_time_upload)))

def upload_to_s3(output, file_name):
    output.seek(0)
    s3_client.put_object(
        Bucket='monkey-business-dev-data', Key='sample-files/daily/banana/large/{}.txt'.format(file_name),
        ServerSideEncryption='AES256',
        Body=output,
        ACL='bucket-owner-full-control'
    )

# consumer_process = multiprocessing.Process(target=data_consumer, args=(process_queue))
# consumer_process.start()
#
#
# def data_consumer(queue):
#     while queue.empty() is False:

if __name__ == '__main__':
    stream_zip_file()
Now I have tried several ways to do this. My specific requirement is a thread pool of 10 threads that always poll a queue: if a chunk is available on the queue, a thread picks it up and starts uploading it to S3; meanwhile one thread keeps polling the queue for new chunks, and if another chunk becomes available while the first thread is still busy with its upload, a new thread automatically starts and uploads that file, and so on. I have checked many answers here and on Google, but nothing seems to work or make sense to me.
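No answer is recorded here, but the pattern described (a fixed pool of uploader threads draining a queue) can be sketched roughly as follows, reusing upload_to_s3 from the code above; the sentinel shutdown and the exact hand-off point are my own assumptions:

import threading
import queue

upload_queue = queue.Queue()
NUM_WORKERS = 10

def upload_worker():
    while True:
        item = upload_queue.get()
        if item is None:                   # sentinel: no more chunks are coming
            upload_queue.task_done()
            break
        chunk, chunk_number = item
        upload_to_s3(chunk, chunk_number)  # the upload function from the question
        upload_queue.task_done()

workers = [threading.Thread(target=upload_worker) for _ in range(NUM_WORKERS)]
for w in workers:
    w.start()

# Producer side: inside stream_zip_file(), instead of output.close(),
# hand each finished 5000-line chunk to the workers:
#     output.seek(0)
#     upload_queue.put((output, file_clounter))

# After the zip stream is exhausted, shut the workers down:
for _ in workers:
    upload_queue.put(None)
for w in workers:
    w.join()

Because the workers spend their time waiting on network I/O, plain threads (or multiprocessing.dummy.Pool, already hinted at in the commented-out import) are sufficient here; the GIL is not the bottleneck.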

Multiprocessing Queue.get() hangs

I'm trying to implement basic multiprocessing and I've run into an issue. The python script is attached below.
import time, sys, random, threading
from multiprocessing import Process
from Queue import Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue(10)
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        item = append_queue.get()
        database.append(item)
        print("Appended to database in %.4f seconds" % database.append_time)
        append_queue.task_done()
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    append_queue.join()
    #append_queue_process.join()
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
The AnalyzeFrequency class analyzes the frequencies of words in a file, and get_results() returns a sorted list of those words and frequencies. The list is very large, perhaps 10000 items.
This list is then passed to the add_to_append_queue method, which adds it to a queue. process_append_queue takes the items one by one and adds the frequencies to a "database". This operation takes a bit longer than the actual analysis in main(), so I am trying to use a separate process for this method. When I do this with the threading module, everything works perfectly fine with no errors. When I use Process, the script hangs at item = append_queue.get().
Could someone please explain what is happening here, and perhaps direct me toward a fix?
All answers appreciated!
UPDATE
The pickle error was my fault, it was just a typo. Now I am using the Queue class within multiprocessing but the append_queue.get() method still hangs.
NEW CODE
import time, sys, random
from multiprocessing import Process, Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue()
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        database.append(append_queue.get())
        print("Appended to database in %.4f seconds" % database.append_time)
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    #append_queue.join()
    #append_queue_process.join()
    print str(append_queue.qsize())
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
UPDATE 2
This is the database code:
class FrequencyStore:

    def __init__(self):
        self.sorter = Sorter()
        self.db = {}
        self.load_time = -1
        self.save_time = -1
        self.append_time = -1
        self.sort_time = -1

    def load_db(self):
        start_time = time.time()
        try:
            file = open("results.txt", 'r')
        except:
            raise IOError
        self.db = {}
        for line in file:
            word, count = line.strip("\n").split("=")
            self.db[word] = int(count)
        file.close()
        self.load_time = time.time() - start_time

    def save_db(self):
        start_time = time.time()
        _db = []
        for key in self.db:
            _db.append([key, self.db[key]])
        _db = self.sort(_db)
        try:
            file = open("results.txt", 'w')
        except:
            raise IOError
        file.truncate(0)
        for x in _db:
            file.write(x[0] + "=" + str(x[1]) + "\n")
        file.close()
        self.save_time = time.time() - start_time

    def create_sorted_db(self):
        _temp_db = []
        for key in self.db:
            _temp_db.append([key, self.db[key]])
        _temp_db = self.sort(_temp_db)
        _temp_db.reverse()
        return _temp_db

    def get_db(self):
        return self.db

    def sort(self, _list):
        start_time = time.time()
        _list = self.sorter.mergesort(_list)
        _list.reverse()
        self.sort_time = time.time() - start_time
        return _list

    def append(self, _list):
        start_time = time.time()
        for x in _list:
            if x[0] not in self.db:
                self.db[x[0]] = x[1]
            else:
                self.db[x[0]] += x[1]
        self.append_time = time.time() - start_time
Comments suggest you're trying to run this on Windows. As I said in a comment,
If you're running this on Windows, it can't work - Windows doesn't
have fork(), so each process gets its own Queue and they have nothing
to do with each other. The entire module is imported "from scratch" by
each process on Windows. You'll need to create the Queue in main(),
and pass it as an argument to the worker function.
Here's fleshing out what you need to do to make it portable, although I removed all the database stuff because it's irrelevant to the problems you've described so far. I also removed the daemon fiddling, because that's usually just a lazy way to avoid shutting things down cleanly, and as often as not it will come back to bite you later:
def process_append_queue(append_queue):
    while True:
        x = append_queue.get()
        if x is None:
            break
        print("processed %d" % x)
    print("worker done")

def main():
    import multiprocessing as mp
    append_queue = mp.Queue(10)
    append_queue_process = mp.Process(target=process_append_queue, args=(append_queue,))
    append_queue_process.start()
    for i in range(100):
        append_queue.put(i)
    append_queue.put(None)  # tell worker we're done
    append_queue_process.join()

if __name__=="__main__":
    main()
The output is the "obvious" stuff:
processed 0
processed 1
processed 2
processed 3
processed 4
...
processed 96
processed 97
processed 98
processed 99
worker done
Note: because Windows doesn't (can't) fork(), it's impossible for worker processes to inherit any Python object on Windows. Each process runs the entire program from its start. That's why your original program couldn't work: each process created its own Queue, wholly unrelated to the Queue in the other process. In the approach shown above, only the main process creates a Queue, and the main process passes it (as an argument) to the worker process.
queue.Queue is thread-safe, but doesn't work across processes. This is quite easy to fix, though. Instead of:
from multiprocessing import Process
from Queue import Queue
You want:
from multiprocessing import Process, Queue

apache/flask - when uploading a file, requests are being queued, the server is unresponsive & memory gets high

I'm running Flask on Apache with mod_wsgi. Every time a file gets uploaded, the server's response time worsens and memory usage climbs until the machine starts swapping.
Below is the whole code that handles my upload:
@app.route(sApiRoute + '/f/upload', methods=['POST'])
@hasToken
def pbFileUpload(token=None, **kwargs):
    def log_time(start_dt=None, start_time=None, doc=None):
        end_time = None
        if start_time is not None:
            end_time = str(time.time() - start_time)
        if end_time is not None and end_time > str(0.3):
            pblog.warning("Upload with long response time. Start: " + str(start_dt) + "Duration: " + str(end_time) + " \nUploader: " + token["id_user"] + " \nDoc info: " + str(doc))

    import time
    start_time = time.time()
    start_dt = current_datetime()

    # parse path
    path = getUploadPath(request.json, None, token)
    if not path:
        return r404()

    # Get the name of the uploaded file
    filename = request.json["name"]
    filename = filename.replace("$c$","č").replace("$c2$","ć").replace("$z$","ž").replace("$s$","š").replace("$d$","đ").replace("$C$","Č").replace("$C2$","Ć").replace("$Z$","Ž").replace("$S$","Š").replace("$D$","Đ")
    rawfile = request.json["data"]
    header, data = rawfile.split(',')

    # convert data
    import base64
    recoveredfile = base64.decodebytes(bytes(data, 'UTF-8'))

    # write data
    import uuid
    # leave file name as is
    if request.json.get("leaveName"):
        generatedfilename = filename
        leaveName = True
    else:
        generatedfilename = str(uuid.uuid1())
        leaveName = False

    # use of ftp to save files
    if 'ftp.' in path.lower():
        putFileToFtp(path, filename, generatedfilename, recoveredfile)
    # classic directory structure
    else:
        if not os.path.exists(path):
            try:
                os.mkdir(path)
            except Exception as e:
                pblog.warning(str(e), exc_info=1)
                return jsonify(data={"status": False, "msg": "Path not found"})
        while os.path.exists(os.path.join(path, generatedfilename)) and not leaveName:
            generatedfilename = str(uuid.uuid1())
        with open(os.path.join(path, generatedfilename), "wb") as out_file:
            out_file.write(recoveredfile)
            out_file.close()

    # send to document system
    if request.json.get("dataRec"):
        db = DBStore.getDB(token["current_company_id"])
        dataRec = request.json.get("dataRec")
        dataRec["idext"] = generatedfilename
        rtd = DocumentCls.getIdByIdent(dataRec["IdDocumentType"], db)
        if rtd and rtd != []:
            if rtd.get("Id"):
                dataRec["IdDocumentType"] = rtd["Id"]
        r = DocumentCls.saveDocument(dataRec, request.json.get("username"), db)
        log_time(start_dt, start_time, dataRec)
        return jsonify(data={"version": r, "generatedfilename": generatedfilename}, **kwargs)
    else:
        log_time(start_dt, start_time, generatedfilename)
        return jsonify(data={"filename": filename, "generatedfilename": generatedfilename}, **kwargs)
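No answer is recorded for this question, but one likely contributor to the memory growth is that the file arrives base64-encoded inside a JSON body, so the whole payload is parsed, held, and decoded in memory for every request. Purely as a hedged sketch (not the poster's code), a multipart/form-data endpoint lets werkzeug stream the file to disk instead; the route name and UPLOAD_PATH are assumptions:

import os
import uuid
from flask import request, jsonify

@app.route(sApiRoute + '/f/upload-stream', methods=['POST'])
@hasToken
def pbFileUploadStream(token=None, **kwargs):
    f = request.files['file']          # werkzeug FileStorage object
    generatedfilename = str(uuid.uuid1())
    # FileStorage.save() copies the upload to disk in chunks rather than
    # materialising the whole decoded file in memory
    f.save(os.path.join(UPLOAD_PATH, generatedfilename))
    return jsonify(data={"generatedfilename": generatedfilename}, **kwargs)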

using python stream downloading file with server limit

I am trying to download files from a server using Python. Sometimes the file is very large, and I would like to have a progress bar. One way I can think of is to download it as a stream so that I can print the progress. So far I have tried the standard urlopen, urlretrieve, and the requests module (with stream on).
Obviously, urlopen cannot download a file as a stream; the requests module supports this, but the server limits how many files I can download at one time (its limit is 1). So every time I try to use requests, I only get back the web page telling me to wait. Is there any other way to do this?
I have very recently downloaded many types of media with this function:
import sys
import requests
import time

def download_resource(domain, url, file_name=None, download=True):
    cookies = {}
    return_obj = {}  # holds metadata about the download (e.g. the resolved file name)
    s = requests.Session()
    s.config['keep_alive'] = True  # note: Session.config only exists in very old (pre-1.0) versions of requests
    # add your own cookies here, I have a specific function I call
    # for my application but yours is different
    r = s.get(url, cookies=cookies, stream=True)
    if not r.ok:
        print "error in downloading"
        return -1
    file_size = int(r.headers['content-length'])
    if not file_name:
        try:
            temp = r.headers['content-disposition']
        except Exception as e:
            pass
            # failing download
            return -1
        else:
            if not temp:
                return -1
            else:
                file_name = temp.split("filename=")[-1]
                return_obj["filename"] = file_name
    # print "File size:", file_size
    # print "\n", str(self.entire_size / float(1024*1024*1024)), "\n"
    print "Downloading:", file_name
    if download:
        with open(file_name, "wb") as fh:
            count = 1
            chunk_size = 1048576
            start_time = time.time()
            try:
                for block in r.iter_content(chunk_size):
                    total_time = time.time() - start_time
                    percent = count * chunk_size / float(file_size) * 100.0
                    fraction = int(percent / 5)
                    download_speed = 1.0 / total_time
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-20s] %d%% %3.2f MB/s " % ('=' * fraction, percent, download_speed))
                    sys.stdout.flush()
                    if not block:
                        break
                    fh.write(block)
                    count += 1
                    start_time = time.time()
            except Exception as e:
                print e
            finally:
                # close up the stream
                r.close()
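A hypothetical call, assuming the cookie-handling stub above has been filled in; the domain and URL are placeholders, not values from the original answer:

if __name__ == "__main__":
    # file_name is omitted, so it is taken from the Content-Disposition header
    download_resource("example.com", "http://example.com/files/big_archive.zip")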
