write data to JSON file during multiprocessing using python

I am new to Python. I am writing a program that writes to a JSON file when a website is unreachable. Multiple websites are stored in the hosts variable, and the check is scheduled to run every 5 seconds. I used Pool from multiprocessing so the websites are processed at the same time, without delay, and afterwards I write the data to the JSON file. However, only one website's data is being written to the JSON file. How can I make it write the data for both websites?
Here's the sample code:
import os
from multiprocessing import Pool
from datetime import datetime
import time
import json
hosts = ["www.google.com","www.smackcoders.com"]
n = len(hosts)
def write(hosts):
    u = "down"
    name = "stack.json"
    if not os.path.exists(name):
        with open(name, 'w') as f:
            f.write('{}')
    result = [(timestamp, {'monitor.status': u,
                           "monitor.id": "tcp-tcp#"+hosts
                           })]
    with open(name, 'rb+') as f:
        f.seek(-1, os.SEEK_END)
        f.truncate()
        for entry in result:
            _entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
            _entry = _entry.encode()
            f.write(_entry)
        f.write('}'.encode('ascii'))

def main(hosts):
    p = Pool(processes= n)
    result = p.map(write, hosts)

while True:
    timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
    main(hosts)
    time.sleep(5)
My output:
""March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},
}
Required Output:
{"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.google.com", "monitor.status": "down"},
}

I've made some minor changes to your code and implemented a lock.
import os
from multiprocessing import Pool,RLock
from datetime import datetime
import time
import json
file_lock=RLock()
hosts = ["www.google.com","www.smackcoders.com"]
n = len(hosts)
def write(hosts):
    u = "down"
    name = "stack.json"
    if not os.path.exists(name):
        with open(name, 'w') as f:
            f.write('{}')
    result = [(timestamp, {'monitor.status': u,
                           "monitor.id": "tcp-tcp#"+hosts
                           })]
    with file_lock:
        with open(name, 'rb+') as f:
            f.seek(-1, os.SEEK_END)
            f.truncate()
            for entry in result:
                _entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
                _entry = _entry.encode()
                f.write(_entry)
            f.write('}'.encode('ascii'))

def main(hosts):
    p = Pool(processes= n)
    result = p.map(write, hosts)

while True:
    timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
    main(hosts)
    time.sleep(5)
However, a long-running process that constantly reads and rewrites a file for logging seems like a poor implementation, since the code has to read an increasingly bulky file and completely rewrite it on every pass. Consider writing the log to a database instead.
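As a rough illustration of that suggestion, here is a minimal sketch using the standard-library sqlite3 module; the database path, table name and columns are made up for the example, and each check simply appends a row instead of rewriting a JSON file:
import sqlite3
from datetime import datetime

def log_status(db_path, host, status):
    """Append one row per check instead of rereading and rewriting a JSON file."""
    conn = sqlite3.connect(db_path)
    with conn:  # commits the transaction on success
        conn.execute(
            "CREATE TABLE IF NOT EXISTS monitor_log "
            "(timestamp TEXT, host TEXT, status TEXT)"
        )
        conn.execute(
            "INSERT INTO monitor_log VALUES (?, ?, ?)",
            (datetime.now().strftime("%B %d %Y, %H:%M:%S"), host, status),
        )
    conn.close()

# Hypothetical usage from each worker:
# log_status("stack.db", "www.google.com", "down")
SQLite also handles its own file locking, so the writers do not need to coordinate through an explicit lock around the file.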

Here's a different option that uses Thread instead of Pool.
I created a class that overrides Thread so the return value can be retrieved from join().
# Class that overwrite Thread to get the return of join()
class ThreadWithReturnValue(Thread):
    def __init__(self, group=None, target=None, name=None, args=None, kwargs=None, Verbose=None):
        if args is None:
            args = ()
        if kwargs is None:
            kwargs = {}
        super().__init__(group, target, name, args, kwargs)
        self._return = None

    def run(self):
        print(type(self._target))
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, *args):
        Thread.join(self, *args)
        return self._return
I have changed the code to get the status of each host first, then write the results to your file. I also fixed the way the JSON file is written.
import os
from datetime import datetime
import time
import json
from threading import Thread

hosts = ["www.google.com","www.smackcoders.com"]
filepath = os.path.join(os.getcwd(), "stack.json")
n = len(hosts)

def perform_ping(host_ip):
    """
    You have hardcoded down, this method will ping to check if we get an ICMP response
    """
    response = os.system("ping -c 1 " + host_ip)
    if response == 0:
        return 'UP'
    else:
        return 'DOWN'

def write_result(timestamp, results):
    # u = "down" Using perform_ping to get the status
    if not os.path.exists(filepath):
        current_file = {}
    else:
        # If file exist, reading the current output
        with open(filepath, 'r') as f_read:
            current_file = json.loads(f_read.read())
    inner_result = []
    for result in results:
        host, status = result
        inner_result.append({'monitor.status': status,
                             "monitor.id": "tcp-tcp#"+host
                             })
    current_file[timestamp] = inner_result
    # writing the file with new input
    with open(filepath, 'w') as f_write:
        f_write.write(json.dumps(current_file))

def main():
    while True:
        thread_list = []
        for host_ip in hosts:
            thread_list.append(ThreadWithReturnValue(target=perform_ping, name=host_ip, args=(host_ip, )))
        results = []
        timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
        for thread in thread_list:
            thread.start()
        for thread in thread_list:
            results.append((thread.name, thread.join()))
        # Ping is done in parallel, writing the result at the end to avoid thread collision
        # and reading/writing the file too many times if you increase the number of hosts
        write_result(timestamp, results)
        time.sleep(5)

if __name__ == '__main__':
    main()

Related

multithread pinging of IP address in Python

I have a list of about 1000 IP addresses. I read them from ip_file.txt and store the results in a file named result_date.txt. Below is the code I use to achieve this, but my issue is that it takes too long to process all of them. Can anyone suggest a multithreading approach so that the desired result can be achieved more quickly? Thanks in advance.
#!/usr/bin/env python
import os
import csv
import paramiko
from datetime import datetime
import time
import sys
import re
from collections import defaultdict
# Verifies your os type
from paramiko import file

OS_TYPE = os.name
# Sets the count modifier to the os type
count = '-n' if OS_TYPE == 'nt' else '-c'

def create_ip_list():
    ip_list = []
    with open("ip_file.txt", "r") as file:
        for line in file:
            ip_list.append(line.strip())
    return ip_list

# fetching data
now = datetime.now()
dat = now.strftime("%d/%m/%Y")
# time = now.strftime("%H:%M:%S")
date_string = dat.replace('/', '-')
timestr = time.strftime("%d%m%Y-%H%M%S")

def ping_device(ip_list):
    """Ping ip_list and return results
    return: None
    rtype: None
    """
    results_file = open("results_" + str(timestr) + ".txt", "w")
    for ip in ip_list:
        response = os.popen(f"ping {ip} {count} 1").read()
        time.sleep(1.5)
        #fetch Average time
        print(response)
        for i in response.split("\n"):
            para = i.split("=")
            try:
                if para[0].strip() == "Minimum":
                    latency = para[3].strip()
                    print(latency)
                    # output1=latency[0:8].split(" ")
                    # test=output1[0]
                    # print(test)
            except:
                print("time run")
        if "Received = 1" and "Approximate" in response:
            #print(f"UP {ip} Ping Successful")
            results_file.write(f"{ip},UP,{latency}" + "\n")
        else:
            print(f"Down {ip} Ping Unsuccessful")
            results_file.write(f"{ip} Down" + "\n")
    results_file.close()

if __name__ == "__main__":
    ping_device(create_ip_list())
Write a function ping_one_device that takes a single ip and returns a single string giving the status. It should be easy to pull this out of ping_device.
Then
with open("results_" + timestr + ".txt", "w") as results_file:
    with ThreadPoolExecutor() as executor:
        for result in executor.map(ping_one_device, ip_list):
            results_file.write(result)
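In case it helps, here is a minimal sketch of what that extracted function might look like, assuming the same platform-dependent count flag and Windows-style latency parsing as the original ping_device; the exact return format is illustrative, and the substring test is written out explicitly for both substrings, unlike the original `if "Received = 1" and "Approximate" in response`:
import os

# Same platform-dependent count flag as in the question's script.
count = '-n' if os.name == 'nt' else '-c'

def ping_one_device(ip):
    """Ping a single IP and return one result line for the output file."""
    response = os.popen(f"ping {ip} {count} 1").read()
    latency = ""
    for line in response.split("\n"):
        para = line.split("=")
        if len(para) >= 4 and para[0].strip() == "Minimum":
            latency = para[3].strip()
    if "Received = 1" in response and "Approximate" in response:
        return f"{ip},UP,{latency}\n"
    return f"{ip},Down\n"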

Use Threading in my Python code - Simple Ping and NSLOOKUP

I have created a script to run a simple ping and nslookup test, and it works fine. The only problem is that it takes a huge amount of time if I have a lot of devices. One option I came across is to use threading. Unfortunately, after a lot of research, the only thing I realized is that Python beginners and threading don't go along well. I was hoping I could get some help and actually see how it works in my code so that I could apply it in my future programs too. I tried using a few lines of multiprocessing code in my program, but I guess it's not working.
This is my code:
import csv
import subprocess
import socket
from multiprocessing import Pool

class Devices:
    def __init__(self, name):
        self.name = name

    def hostname(self):
        if ".com" in self.name:
            return self.name.split('.')[0]
        else:
            return self.name

    def pingtest(self):
        response = subprocess.Popen(['ping.exe', device.hostname()], stdout=subprocess.PIPE).communicate()[0]
        response = response.decode()
        if 'bytes=32' in response:
            return 'Up'
        else:
            return 'Down'

    def nslookup(self):
        try:
            name = socket.getfqdn(device.hostname())
            return name
        except socket.error:
            return 'Error'

def initializefile(file):
    with open('Book1.csv', 'r', newline='') as i:
        return convertrows(csv.DictReader(i))

def convertrows(rows):
    return [Devices(row['Device_Name']) for row in rows]

file = r"My\Book1.csv"
devices = initializefile(file)

with open('Output_PingTest_Threading.csv', 'w', newline='') as csvoutput:
    fieldnames = ['Device', 'Ping Test', 'NSLOOKUP']
    output = csv.DictWriter(csvoutput, fieldnames=fieldnames)
    output.writeheader()

for device in devices:
    with open('Output_PingTest_Threading.csv', 'a', newline='') as csvoutput:
        output = csv.writer(csvoutput)
        output.writerows([[device.name] + [device.pingtest()] + [device.nslookup()]])
    print("Device: %s" % device.name)
    print("Ping Status: %s" % device.pingtest())
    print("NSLOOKUP: %s\n" % device.nslookup())

if __name__ == '__main__':
    pool = Pool()
    pool.map(device.pingtest(), device.nslookup(), device)
    pool.close()
    pool.join()
Basically, I am only looking to create 2 threads for the 2 functions (pingtest and nslookup); maybe once I get the hang of it, I can use it in other programs as well.
So I was able to create threads for each of the functions, and it did reduce the execution time by almost 50%, although I feel it could be reduced much further; any help is appreciated!
CODE WITHOUT THREADING:
import csv
import subprocess
import time
import socket

class Devices:
    def __init__(self, name):
        self.name = name

    def pingtest(self):
        response = subprocess.Popen(['ping.exe', device.name], stdout=subprocess.PIPE).communicate()[0]
        response = response.decode()
        if 'bytes=32' in response:
            return 'Up'
        else:
            return 'Down'

    def nslookup(self):
        name = socket.getfqdn(device.name)
        return(name)

def initializefile(file):
    with open('List_of_6_Devices.csv', 'r', newline='') as i:
        return convertrows(csv.DictReader(i))

def convertrows(rows):
    return [Devices(row['New Name']) for row in rows]

file = r"My\List_of_6_Devices.csv"
devices = initializefile(file)
_start = time.time()

for device in devices:
    #_start = time.time()
    device.pingtest()
    print("Device: %s" % device.name)
    print("Ping Status: %s" % device.pingtest())
    print("FQDN: %s" % device.nslookup())

print("TOTAL EXECUTION TIME", (time.time() - _start))
OUTPUT:
{PING STATUS OF 6 DEVICE HERE }
TOTAL EXECUTION TIME 41.68950819969177
CODE WITH THREADING:
import threading
import csv
import subprocess
import socket
import time

def ping():
    response = subprocess.Popen(['ping.exe', device], stdout=subprocess.PIPE).communicate()[0]
    response = response.decode()
    if 'bytes=32' in response:
        status = 'Up'
        print("Ping status: %s\n" % status)
    else:
        status = 'Down'
        print("Ping status: %s\n" % status)

def nsloookup():
    name = socket.getfqdn(device)
    print("FQDN: %s" % name)

def initializefile(file):
    with open('List_of_6_Devices.csv', 'r') as f:
        return convertrows(csv.DictReader(f))

def convertrows(rows):
    return [(row['New Name']) for row in rows]

file = r"My\List_of_6_Devices.csv"
devices = initializefile(file)

if __name__ == "__main__":
    # creating thread
    _start = time.time()
    for device in devices:
        t1 = threading.Thread(target=ping)
        t2 = threading.Thread(target=nsloookup())
        # starting thread 1
        t1.start()
        # starting thread 2
        t2.start()
        # wait until thread 1 is completely executed
        t1.join()
        # wait until thread 2 is completely executed
        t2.join()
    # both threads completely executed
    print("TOTAL EXECUTION TIME", (time.time() - _start))
OUTPUT:
{PING STATUS OF 6 DEVICES}
TOTAL EXECUTION TIME 24.59475827217102
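The loop above still handles devices one at a time: it starts and joins two threads per device before moving on to the next, which is why the gain stops around 50%. A thread pool that runs every device's check concurrently would likely cut the time further. Here is a minimal sketch under that assumption, reusing the same Windows-style ping.exe test from the question; check_device and the worker count are illustrative names, not part of the original code:
import socket
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor

def check_device(name):
    """Run the ping and FQDN lookup for one device and return both results."""
    response = subprocess.Popen(
        ['ping.exe', name], stdout=subprocess.PIPE
    ).communicate()[0].decode()
    status = 'Up' if 'bytes=32' in response else 'Down'
    fqdn = socket.getfqdn(name)
    return name, status, fqdn

def check_all(devices, workers=20):
    _start = time.time()
    # All devices are pinged and resolved concurrently, not pairwise per device.
    with ThreadPoolExecutor(max_workers=workers) as executor:
        results = list(executor.map(check_device, devices))
    for name, status, fqdn in results:
        print("Device: %s  Ping: %s  FQDN: %s" % (name, status, fqdn))
    print("TOTAL EXECUTION TIME", time.time() - _start)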

Capture and display console output of function at the same time

Right now I use this to catch the output of a Python function and store it in a variable:
import io
from contextlib import redirect_stdout

def catch_output(func):
    result = io.StringIO()
    with redirect_stdout(result):
        func()
    return result.getvalue()

output = catch_output(my_func)
This works fine, but it also mutes the console until the func call has finished.
Does anybody know if I can write/pipe the live output of the func to the console and store it in a variable at the same time?
You can redirect stdout to a custom file-like object that forwards writes to multiple files:
import contextlib
import io
import sys

class TeeIO:
    def __init__(self, original, target):
        self.original = original
        self.target = target

    def write(self, b):
        self.original.write(b)
        self.target.write(b)

@contextlib.contextmanager
def tee_stdout(target):
    tee = TeeIO(sys.stdout, target)
    with contextlib.redirect_stdout(tee):
        yield

buf = io.StringIO()
with tee_stdout(buf):
    print("foo")
print(buf.getvalue())
This is what I ended up using. I thought I'd leave it here for people who, like me, have a hard time with classes and OOP.
import sys
import io
from contextlib import redirect_stdout

def get_multi_writer(streams):
    writer = type('obj', (object,), {})
    writer.write = lambda s: [stream.write(s) for stream in streams]
    return writer

def catch_output(func, args, kwargs):
    streams = [sys.stdout, io.StringIO()]
    with redirect_stdout(get_multi_writer(streams)):
        func(*args, **kwargs)
    return streams[1].getvalue()

print(catch_output(my_func, [], {}))
As per the suggestions in the comments, I've made an example that turns our function into a thread so we can periodically check for output from that function and copy it to the real stdout at the same time.
import sys
import time
import threading
from cStringIO import StringIO

def foo(n):
    for x in range(n):
        time.sleep(1)  #intense computation
        print('test: {}'.format(n))

#i'm using python 2.7 so I don't have contextlib.redirect_stdout
realstdout = sys.stdout
sys.stdout = StringIO()

t = threading.Thread(target=foo, args=(10,))
t.start()

lastpos = 0                                  #last cursor position in file
while True:
    t.join(.1)                               #wait .1 sec for thread to complete
    if sys.stdout.tell() != lastpos:         #data has been written to stdout
        sys.stdout.seek(lastpos)             #go back to our last position
        realstdout.write(sys.stdout.read())  #read the data to the real stdout
        lastpos = sys.stdout.tell()          #update lastpos
    if not t.is_alive():                     #when we're done
        break

sys.stdout.seek(0)          #seek back to beginning of file
output = sys.stdout.read()  #copy to a usable variable
sys.stdout = realstdout     #reset stdout

Multiprocessing Queue.get() hangs

I'm trying to implement basic multiprocessing and I've run into an issue. The python script is attached below.
import time, sys, random, threading
from multiprocessing import Process
from Queue import Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue(10)
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        item = append_queue.get()
        database.append(item)
        print("Appended to database in %.4f seconds" % database.append_time)
        append_queue.task_done()
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    append_queue.join()
    #append_queue_process.join()
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
The AnalyzeFrequency analyzes the frequencies of words in a file and get_results() returns a sorted list of said words and frequencies. The list is very large, perhaps 10000 items.
This list is then passed to the add_to_append_queue method, which adds it to a queue. process_append_queue takes the items one by one and adds the frequencies to a "database". This operation takes a bit longer than the actual analysis in main(), so I am trying to use a separate process for this method. When I try to do this with the threading module, everything works perfectly fine, no errors. When I try to use Process, the script hangs at item = append_queue.get().
Could someone please explain what is happening here, and perhaps direct me toward a fix?
All answers appreciated!
UPDATE
The pickle error was my fault, it was just a typo. Now I am using the Queue class within multiprocessing but the append_queue.get() method still hangs.
NEW CODE
import time, sys, random
from multiprocessing import Process, Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue()
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        database.append(append_queue.get())
        print("Appended to database in %.4f seconds" % database.append_time)
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    #append_queue.join()
    #append_queue_process.join()
    print str(append_queue.qsize())
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
UPDATE 2
This is the database code:
class FrequencyStore:
    def __init__(self):
        self.sorter = Sorter()
        self.db = {}
        self.load_time = -1
        self.save_time = -1
        self.append_time = -1
        self.sort_time = -1

    def load_db(self):
        start_time = time.time()
        try:
            file = open("results.txt", 'r')
        except:
            raise IOError
        self.db = {}
        for line in file:
            word, count = line.strip("\n").split("=")
            self.db[word] = int(count)
        file.close()
        self.load_time = time.time() - start_time

    def save_db(self):
        start_time = time.time()
        _db = []
        for key in self.db:
            _db.append([key, self.db[key]])
        _db = self.sort(_db)
        try:
            file = open("results.txt", 'w')
        except:
            raise IOError
        file.truncate(0)
        for x in _db:
            file.write(x[0] + "=" + str(x[1]) + "\n")
        file.close()
        self.save_time = time.time() - start_time

    def create_sorted_db(self):
        _temp_db = []
        for key in self.db:
            _temp_db.append([key, self.db[key]])
        _temp_db = self.sort(_temp_db)
        _temp_db.reverse()
        return _temp_db

    def get_db(self):
        return self.db

    def sort(self, _list):
        start_time = time.time()
        _list = self.sorter.mergesort(_list)
        _list.reverse()
        self.sort_time = time.time() - start_time
        return _list

    def append(self, _list):
        start_time = time.time()
        for x in _list:
            if x[0] not in self.db:
                self.db[x[0]] = x[1]
            else:
                self.db[x[0]] += x[1]
        self.append_time = time.time() - start_time
Comments suggest you're trying to run this on Windows. As I said in a comment,
If you're running this on Windows, it can't work - Windows doesn't
have fork(), so each process gets its own Queue and they have nothing
to do with each other. The entire module is imported "from scratch" by
each process on Windows. You'll need to create the Queue in main(),
and pass it as an argument to the worker function.
Here's fleshing out what you need to do to make it portable, although I removed all the database stuff because it's irrelevant to the problems you've described so far. I also removed the daemon fiddling, because that's usually just a lazy way to avoid shutting down things cleanly, and often as not will come back to bite you later:
def process_append_queue(append_queue):
    while True:
        x = append_queue.get()
        if x is None:
            break
        print("processed %d" % x)
    print("worker done")

def main():
    import multiprocessing as mp
    append_queue = mp.Queue(10)
    append_queue_process = mp.Process(target=process_append_queue, args=(append_queue,))
    append_queue_process.start()
    for i in range(100):
        append_queue.put(i)
    append_queue.put(None)  # tell worker we're done
    append_queue_process.join()

if __name__=="__main__":
    main()
The output is the "obvious" stuff:
processed 0
processed 1
processed 2
processed 3
processed 4
...
processed 96
processed 97
processed 98
processed 99
worker done
Note: because Windows doesn't (can't) fork(), it's impossible for worker processes to inherit any Python object on Windows. Each process runs the entire program from its start. That's why your original program couldn't work: each process created its own Queue, wholly unrelated to the Queue in the other process. In the approach shown above, only the main process creates a Queue, and the main process passes it (as an argument) to the worker process.
queue.Queue is thread-safe, but doesn't work across processes. This is quite easy to fix, though. Instead of:
from multiprocessing import Process
from Queue import Queue
You want:
from multiprocessing import Process, Queue

Python multiprocessing pool memory issue

I want to run a Python process that stays alive in the background, monitoring whether some XMLs have been updated. However, the process shows up as python [defunct] after a while. I suspect it's a memory-related issue because I can see the memory of the child process keep growing, but I'm not sure how to fix it. Here's the code:
def getXmlFile(url):
    req = urllib2.Request(url)
    rep = urllib2.urlopen(req)
    response_content = rep.read()
    last_modified = rep.info().getheader('Last-Modified')
    return (response_content, last_modified)

def writeXmlFile(content, fileName):
    file_path = os.path.join(OUTPUT, fileName)
    f = open(file_path, 'w')
    f.write(content)
    f.close()

def downloadXml(xml):
    url = BASE_URL + xml
    (response, last_modified) = getXmlFile(url)
    if LAST_MODIFIED.get(xml) is None or LAST_MODIFIED[xml] != last_modified:
        LAST_MODIFIED[xml] = last_modified
        utc_datetime = datetime.datetime.utcnow()
        currentTime = utc_datetime.strftime('%Y%m%d%H%M%S')
        fileName = xml.replace('.XML', '_%s.XML' % currentTime)
        print "Writing " + fileName
        writeXmlFile(response, fileName)

def main():
    pool = Pool(processes=1)
    while (True):
        pool.map_async(downloadXml, XmlList)
        time.sleep(2)
    pool.close()
    pool.join()

if __name__ == "__main__":
    main()
Everything works fine initially. It monitors certain XMLs every 2 seconds and downloads any that have been updated. But it hangs after a while, and ps aux shows the child python process as python [defunct].
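One thing that stands out (a guess, not a confirmed diagnosis) is that the loop fires map_async every 2 seconds without ever waiting on or collecting the results, so pending work and result objects can pile up faster than the single worker consumes them, and the pool.close()/pool.join() after the infinite loop is never reached. A minimal sketch of a variant that waits for each polling pass to finish before sleeping, keeping everything else from the question the same:
def main():
    pool = Pool(processes=1)
    try:
        while True:
            # pool.map blocks until this pass over XmlList is done,
            # so results never accumulate between iterations.
            pool.map(downloadXml, XmlList)
            time.sleep(2)
    finally:
        pool.close()
        pool.join()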
