Use Threading in my Python code - Simple Ping and NSLOOKUP

I have created a script to run simple ping and nslookup tests, and it works fine. The only problem is that it takes a huge amount of time when I have a lot of devices. One option I came across is threading. Unfortunately, after a lot of research, the only thing I realized is that Python beginners and threading don't go along well. I was hoping I could get some help and actually see how it works in my code, so that I can apply it in future programs too. I tried adding a few lines of multiprocessing code to my program, but I guess it's not working.
This is my code:
import csv
import subprocess
import socket
from multiprocessing import Pool

class Devices:
    def __init__(self, name):
        self.name = name

    def hostname(self):
        if ".com" in self.name:
            return self.name.split('.')[0]
        else:
            return self.name

    def pingtest(self):
        response = subprocess.Popen(['ping.exe', device.hostname()], stdout=subprocess.PIPE).communicate()[0]
        response = response.decode()
        if 'bytes=32' in response:
            return 'Up'
        else:
            return 'Down'

    def nslookup(self):
        try:
            name = socket.getfqdn(device.hostname())
            return name
        except socket.error:
            return 'Error'

def initializefile(file):
    with open('Book1.csv', 'r', newline='') as i:
        return convertrows(csv.DictReader(i))

def convertrows(rows):
    return [Devices(row['Device_Name']) for row in rows]

file = r"My\Book1.csv"
devices = initializefile(file)

with open('Output_PingTest_Threading.csv', 'w', newline='') as csvoutput:
    fieldnames = ['Device', 'Ping Test', 'NSLOOKUP']
    output = csv.DictWriter(csvoutput, fieldnames=fieldnames)
    output.writeheader()

for device in devices:
    with open('Output_PingTest_Threading.csv', 'a', newline='') as csvoutput:
        output = csv.writer(csvoutput)
        output.writerows([[device.name] + [device.pingtest()] + [device.nslookup()]])
    print("Device: %s" % device.name)
    print("Ping Status: %s" % device.pingtest())
    print("NSLOOKUP: %s\n" % device.nslookup())

if __name__ == '__main__':
    pool = Pool()
    pool.map(device.pingtest(), device.nslookup(), device)
    pool.close()
    pool.join()
Basically, I am only looking to create two threads for the two functions (pingtest and nslookup); maybe if I could get the hang of it, I could use it in other programs as well.
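For reference, Pool.map() takes one callable and one iterable; the call above passes the results of two method calls rather than the functions themselves. A minimal sketch of the shape a working call would need, assuming pingtest() and nslookup() are rewritten to use self instead of the global device (check_device is an illustrative name):

from multiprocessing import Pool

def check_device(device):
    # must be a module-level function so it can be pickled for the pool on Windows
    return device.name, device.pingtest(), device.nslookup()

if __name__ == '__main__':
    with Pool() as pool:
        results = pool.map(check_device, devices)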

So I was able to create a thread for each of the functions, and it reduced the execution time by almost 50%, although I feel it could be reduced a lot more than this; any help is appreciated!
CODE WITHOUT THREADING:
import csv
import subprocess
import time
import socket

class Devices:
    def __init__(self, name):
        self.name = name

    def pingtest(self):
        response = subprocess.Popen(['ping.exe', device.name], stdout=subprocess.PIPE).communicate()[0]
        response = response.decode()
        if 'bytes=32' in response:
            return 'Up'
        else:
            return 'Down'

    def nslookup(self):
        name = socket.getfqdn(device.name)
        return name

def initializefile(file):
    with open('List_of_6_Devices.csv', 'r', newline='') as i:
        return convertrows(csv.DictReader(i))

def convertrows(rows):
    return [Devices(row['New Name']) for row in rows]

file = r"My\List_of_6_Devices.csv"
devices = initializefile(file)

_start = time.time()
for device in devices:
    #_start = time.time()
    device.pingtest()
    print("Device: %s" % device.name)
    print("Ping Status: %s" % device.pingtest())
    print("FQDN: %s" % device.nslookup())
print("TOTAL EXECUTION TIME", (time.time() - _start))
OUTPUT:
{PING STATUS OF 6 DEVICES HERE}
TOTAL EXECUTION TIME 41.68950819969177
CODE WITH THREADING:
import threading
import csv
import subprocess
import socket
import time

def ping():
    response = subprocess.Popen(['ping.exe', device], stdout=subprocess.PIPE).communicate()[0]
    response = response.decode()
    if 'bytes=32' in response:
        status = 'Up'
        print("Ping status: %s\n" % status)
    else:
        status = 'Down'
        print("Ping status: %s\n" % status)

def nsloookup():
    name = socket.getfqdn(device)
    print("FQDN: %s" % name)

def initializefile(file):
    with open('List_of_6_Devices.csv', 'r') as f:
        return convertrows(csv.DictReader(f))

def convertrows(rows):
    return [(row['New Name']) for row in rows]

file = r"My\List_of_6_Devices.csv"
devices = initializefile(file)

if __name__ == "__main__":
    # creating threads
    _start = time.time()
    for device in devices:
        t1 = threading.Thread(target=ping)
        t2 = threading.Thread(target=nsloookup())
        # starting thread 1
        t1.start()
        # starting thread 2
        t2.start()
        # wait until thread 1 is completely executed
        t1.join()
        # wait until thread 2 is completely executed
        t2.join()
    # both threads completely executed
    print("TOTAL EXECUTION TIME", (time.time() - _start))
OUTPUT:
{PING STATUS OF 6 DEVICES}
TOTAL EXECUTION TIME 24.59475827217102
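One likely reason the gain stops around 50%: target=nsloookup() calls the function immediately in the main thread and passes its return value (None) as the thread target, and each device's threads are joined before the next device starts, so very little actually overlaps. A minimal sketch of fanning the per-device work out over a thread pool instead, reusing the same ping.exe and socket.getfqdn calls (the device list and worker count are illustrative):

from concurrent.futures import ThreadPoolExecutor
import socket
import subprocess

def check(device):
    # both calls are I/O-bound, so many devices can be in flight at once
    response = subprocess.Popen(['ping.exe', '-n', '1', device],
                                stdout=subprocess.PIPE).communicate()[0].decode()
    status = 'Up' if 'bytes=32' in response else 'Down'
    return device, status, socket.getfqdn(device)

devices = ['host1', 'host2', 'host3']  # illustrative; load from the CSV as above

with ThreadPoolExecutor(max_workers=20) as pool:
    for name, status, fqdn in pool.map(check, devices):
        print("Device: %s  Ping: %s  FQDN: %s" % (name, status, fqdn))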

Related

write data to JSON file during multiprocessing using python

I am new to Python. I am writing a program that writes to a JSON file if a website is unreachable. The websites are stored in the hosts variable, and the check is scheduled to run every 5 seconds. I have used Pool from multiprocessing to check the websites at the same time, without delay. After that, I write the data to the JSON file. But it is writing only one website's data to the file. How can I make it write both entries at the same time?
Here's the sample code:
import os
from multiprocessing import Pool
from datetime import datetime
import time
import json

hosts = ["www.google.com", "www.smackcoders.com"]
n = len(hosts)

def write(hosts):
    u = "down"
    name = "stack.json"
    if not os.path.exists(name):
        with open(name, 'w') as f:
            f.write('{}')
    result = [(timestamp, {'monitor.status': u,
                           "monitor.id": "tcp-tcp#" + hosts
                           })]
    with open(name, 'rb+') as f:
        f.seek(-1, os.SEEK_END)
        f.truncate()
        for entry in result:
            _entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
            _entry = _entry.encode()
            f.write(_entry)
        f.write('}'.encode('ascii'))

def main(hosts):
    p = Pool(processes=n)
    result = p.map(write, hosts)

while True:
    timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
    main(hosts)
    time.sleep(5)
My output:
""March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},
}
Required Output:
{"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.google.com", "monitor.status": "down"},
}
I've made some minor changes to your code and implemented a Lock.
import os
from multiprocessing import Pool, RLock
from datetime import datetime
import time
import json

file_lock = RLock()
hosts = ["www.google.com", "www.smackcoders.com"]
n = len(hosts)

def write(hosts):
    u = "down"
    name = "stack.json"
    if not os.path.exists(name):
        with open(name, 'w') as f:
            f.write('{}')
    result = [(timestamp, {'monitor.status': u,
                           "monitor.id": "tcp-tcp#" + hosts
                           })]
    with file_lock:
        with open(name, 'rb+') as f:
            f.seek(-1, os.SEEK_END)
            f.truncate()
            for entry in result:
                _entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
                _entry = _entry.encode()
                f.write(_entry)
            f.write('}'.encode('ascii'))

def main(hosts):
    p = Pool(processes=n)
    result = p.map(write, hosts)

while True:
    timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
    main(hosts)
    time.sleep(5)
However, for a long-running process that constantly has to read and rewrite a file for logging, this seems like a poor implementation: the code has to load an ever-growing file and completely rewrite it on every check. Consider writing the log to a database instead.
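As a minimal sketch of that suggestion, using the standard-library sqlite3 module (the database file, table name, and columns here are illustrative assumptions):

import sqlite3
from datetime import datetime

conn = sqlite3.connect("monitor.db")
conn.execute("""CREATE TABLE IF NOT EXISTS status_log
                (ts TEXT, monitor_id TEXT, status TEXT)""")

def log_status(host, status):
    # one small INSERT per check instead of rewriting a growing JSON file;
    # each worker process should open its own connection
    ts = datetime.now().strftime("%B %d %Y, %H:%M:%S")
    with conn:  # commits on success
        conn.execute("INSERT INTO status_log VALUES (?, ?, ?)",
                     (ts, "tcp-tcp#" + host, status))

log_status("www.google.com", "down")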
Here's a different option that uses Thread instead of Pool.
I created a class that overrides Thread so that join() returns the target's result:
# Class that overrides Thread to get the return value from join()
class ThreadWithReturnValue(Thread):
    def __init__(self, group=None, target=None, name=None, args=None, kwargs=None, Verbose=None):
        if args is None:
            args = ()
        if kwargs is None:
            kwargs = {}
        super().__init__(group, target, name, args, kwargs)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, *args):
        Thread.join(self, *args)
        return self._return
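For example, a quick illustrative check of what the subclass does (pow is just a stand-in target):

t = ThreadWithReturnValue(target=pow, args=(2, 10))
t.start()
print(t.join())  # prints 1024: join() now hands back the target's return value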
I changed the code to get the status of each host first, and then write the results to your file. I also fixed the way the JSON file is written.
import os
from datetime import datetime
import time
import json
from threading import Thread

hosts = ["www.google.com", "www.smackcoders.com"]
filepath = os.path.join(os.getcwd(), "stack.json")
n = len(hosts)

def perform_ping(host_ip):
    """
    You had hardcoded "down"; this method pings to check whether we get an ICMP response.
    """
    response = os.system("ping -c 1 " + host_ip)
    if response == 0:
        return 'UP'
    else:
        return 'DOWN'

def write_result(timestamp, results):
    # u = "down"  -- using perform_ping to get the status instead
    if not os.path.exists(filepath):
        current_file = {}
    else:
        # if the file exists, read the current output
        with open(filepath, 'r') as f_read:
            current_file = json.loads(f_read.read())
    inner_result = []
    for result in results:
        host, status = result
        inner_result.append({'monitor.status': status,
                             "monitor.id": "tcp-tcp#" + host
                             })
    current_file[timestamp] = inner_result
    # write the file with the new input
    with open(filepath, 'w') as f_write:
        f_write.write(json.dumps(current_file))

def main():
    while True:
        thread_list = []
        for host_ip in hosts:
            thread_list.append(ThreadWithReturnValue(target=perform_ping, name=host_ip, args=(host_ip, )))
        results = []
        timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
        for thread in thread_list:
            thread.start()
        for thread in thread_list:
            results.append((thread.name, thread.join()))
        # pings run in parallel; the result is written once at the end to avoid
        # thread collisions, and to avoid reading/writing the file too many times
        # if you increase the number of hosts
        write_result(timestamp, results)
        time.sleep(5)

if __name__ == '__main__':
    main()

What is the most efficient way of sending a sequence of different data types with send_multipart() in ZMQ?

I am trying to use ZeroMQ for multiprocessing. I want to stream files from a tar file, so I used the streamer device.
Below is an instance of what I want to do.
import time
import zmq
from zmq.devices.basedevice import ProcessDevice
from multiprocessing import Process

def server(frontend_port, number_of_workers):
    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    socket.connect("tcp://127.0.0.1:%d" % frontend_port)
    for i in range(0, 10):
        socket.send_json('#%s' % i)
    for i in range(number_of_workers):
        socket.send_json('STOP')
    return True

def worker(work_num, backend_port):
    context = zmq.Context()
    socket = context.socket(zmq.PULL)
    socket.connect("tcp://127.0.0.1:%d" % backend_port)
    while True:
        message = socket.recv_json()
        if message == 'STOP':
            break
        print("Worker #%s got message! %s" % (work_num, message))
        time.sleep(1)

def main():
    frontend_port = 7559
    backend_port = 7560
    number_of_workers = 2

    streamerdevice = ProcessDevice(zmq.STREAMER, zmq.PULL, zmq.PUSH)
    streamerdevice.bind_in("tcp://127.0.0.1:%d" % frontend_port)
    streamerdevice.bind_out("tcp://127.0.0.1:%d" % backend_port)
    streamerdevice.setsockopt_in(zmq.IDENTITY, b'PULL')
    streamerdevice.setsockopt_out(zmq.IDENTITY, b'PUSH')
    streamerdevice.start()

    processes = []
    for work_num in range(number_of_workers):
        w = Process(target=worker, args=(work_num, backend_port))
        processes.append(w)
        w.start()
    time.sleep(1)

    s = Process(target=server, args=(frontend_port, number_of_workers))
    s.start()
    # server(frontend_port)
    s.join()
    for w in processes:
        w.join()

if __name__ == '__main__':
    main()
This code works properly. But I want to use send_multipart() to send a tuple or a list that includes items of different types, like [string, numpy_array, integer], and json can't handle numpy arrays. I am avoiding pickle because I need this to be as fast as possible. I tried converting the array to bytes too, but it didn't work (maybe I was doing it wrong; I am not sure).
I would appreciate a working snippet of code.
Ideally, I want to do something like this:
socket.send_multipart([string, numpy_array, integer])
So I want to know what is the most efficient way of doing it.
I am using Python 3.6
msgpack and msgpack_numpy are the best option I could find.
Try this:
import time
import zmq
from zmq.devices.basedevice import ProcessDevice
from multiprocessing import Process
import numpy as np
import msgpack
import msgpack_numpy as m

def server(frontend_port, number_of_workers):
    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    socket.connect("tcp://127.0.0.1:%d" % frontend_port)
    for i in range(0, 10):
        arr = np.array([[[i, i], [i, i]], [[i, i], [i, i]]])
        file_name = 'image file name or any other string'
        number = 10  # just an instance of an integer
        msg = msgpack.packb((arr, number, file_name), default=m.encode, use_bin_type=True)
        socket.send(msg, copy=False)
        time.sleep(1)
    for i in range(number_of_workers):
        msg = msgpack.packb((b'STOP', b'STOP'), default=m.encode, use_bin_type=True)
        socket.send(msg, copy=False)
    return True

def worker(work_num, backend_port):
    context = zmq.Context()
    socket = context.socket(zmq.PULL)
    socket.connect("tcp://127.0.0.1:%d" % backend_port)
    while True:
        task = socket.recv()
        task = msgpack.unpackb(task, object_hook=m.decode, use_list=False, max_bin_len=50000000, raw=False)
        if task[1] == b'STOP':
            break
        (arr, number, file_name) = task
        print("Worker ", work_num, 'got message!', file_name)
    return True

def main():
    m.patch()
    frontend_port = 3559
    backend_port = 3560
    number_of_workers = 2

    streamerdevice = ProcessDevice(zmq.STREAMER, zmq.PULL, zmq.PUSH)
    streamerdevice.bind_in("tcp://127.0.0.1:%d" % frontend_port)
    streamerdevice.bind_out("tcp://127.0.0.1:%d" % backend_port)
    streamerdevice.setsockopt_in(zmq.IDENTITY, b'PULL')
    streamerdevice.setsockopt_out(zmq.IDENTITY, b'PUSH')
    streamerdevice.start()

    processes = []
    for work_num in range(number_of_workers):
        w = Process(target=worker, args=(work_num, backend_port))
        processes.append(w)
        w.start()
    time.sleep(1)

    s = Process(target=server, args=(frontend_port, number_of_workers))
    s.start()
    s.join()
    for w in processes:
        w.join()

if __name__ == '__main__':
    main()
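For the literal send_multipart() route the question asks about, here is a hedged sketch that sends each item as its own frame and rebuilds the array with numpy.frombuffer on the other side; the small JSON header carrying dtype and shape is an assumption of this sketch, not part of the original code:

import json
import numpy as np

def send_parts(socket, text, arr, number):
    # one frame per item; the array travels as raw bytes plus a small
    # JSON header describing its dtype and shape
    header = json.dumps({"dtype": str(arr.dtype), "shape": arr.shape}).encode()
    socket.send_multipart([text.encode(), header, arr.tobytes(), str(number).encode()])

def recv_parts(socket):
    text, header, raw, number = socket.recv_multipart()
    meta = json.loads(header.decode())
    arr = np.frombuffer(raw, dtype=meta["dtype"]).reshape(meta["shape"])
    return text.decode(), arr, int(number)

Whether this beats the msgpack version is worth measuring: tobytes() copies the data once, but there is no per-element packing.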

TimeoutError when two computers communicate

I am using Python 3 to learn distributed programming.
There are two Python files: one is main.py, which distributes work, and the other, worker.py, which processes the data.
Everything goes well when I run the two files on one computer (server address = 127.0.0.1, port = 5000), but when I run them on separate computers they cannot connect to each other and a TimeoutError is encountered.
I don't know why. One computer is a Win10 machine at my home; the other is a Linux cloud server I bought.
The code works on one computer, but when I ran main.py on Linux and worker.py (with the server changed to the Linux machine's IP address) on Win10, worker.py encountered a TimeoutError.
I know nothing about Linux; are there some security settings I need to open or close?
"""main.py"""
import queue
from multiprocessing.managers import BaseManager
import datetime
import time
TASK_QUEUE = queue.Queue()
RESULT_QUEUE = queue.Queue()
def get_task_queue():
"""set TASK_QUEUE as a function"""
global TASK_QUEUE
return TASK_QUEUE
def receive_result_queue():
"""set RESULT_QUEUE as a function"""
global RESULT_QUEUE
return RESULT_QUEUE
class QueueManager(BaseManager):
"""inherit BaseManager from multiprocessing.managers"""
pass
if __name__ == '__main__':
QueueManager.register('distribute_task_queue', callable=get_task_queue)
QueueManager.register('receive_result_queue', callable=receive_result_queue)
# bind port 5000, set verification code = 'abc'
MANAGER = QueueManager(address=('127.0.0.1', 5000), authkey=b'abc')
# start manager
MANAGER.start()
TASK = MANAGER.distribute_task_queue()
RESULT = MANAGER.receive_result_queue()
# put each line into manager`enter code here`
with open("C:/Users/dayia/Desktop/log.20170817") as f:
for line in f:
TASK.put(line)
# try receive result
while 1:
try:
r = RESULT.get(timeout=1)
if r[0] == r[1] and r[0] == "done":
break
else:
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),"line %s\'s length is %s" % (r[0], r[1]))
except queue.Empty:
print('result queue is empty.')
"""worker.py"""
import datetime
from multiprocessing.managers import BaseManager
import queue
import time

class QueueManager(BaseManager):
    """inherit BaseManager from multiprocessing.managers"""
    pass

QueueManager.register('distribute_task_queue')
QueueManager.register('receive_result_queue')

server_addr = '127.0.0.1'
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Connect to server %s...' % server_addr)
m = QueueManager(address=(server_addr, 5000), authkey=b'abc')
m.connect()
TASK = m.distribute_task_queue()
RESULT = m.receive_result_queue()

def parse_line(line):
    return len(line)

C = 0
while not TASK.empty():
    try:
        n = TASK.get(timeout=1)
        r = parse_line(n)
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'running line %s, length is %s' % (C + 1, r))
        C += 1
        RESULT.put([r, C])
    except queue.Empty:
        print('task queue is empty.')
RESULT.put(["done", "done"])
print('worker exit')
The address 127.0.0.1 very specifically refers to the same computer the code is running on (in network terms: 127.0.0.1 is the IP address of localhost). For two machines to talk, main.py on the Linux server has to bind to an address the worker can actually reach, and worker.py has to connect to the Linux machine's real IP; TCP port 5000 must also be allowed through the server's firewall and the cloud provider's security group.
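A minimal sketch of the two lines that typically need to change (the worker-side IP below is a placeholder for the Linux server's public address):

# main.py, on the Linux server: listen on all interfaces instead of loopback only
MANAGER = QueueManager(address=('0.0.0.0', 5000), authkey=b'abc')

# worker.py, on the Win10 machine: point at the server's public IP (placeholder)
server_addr = '203.0.113.10'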

Multiprocessing Queue.get() hangs

I'm trying to implement basic multiprocessing and I've run into an issue. The python script is attached below.
import time, sys, random, threading
from multiprocessing import Process
from Queue import Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue(10)
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        item = append_queue.get()
        database.append(item)
        print("Appended to database in %.4f seconds" % database.append_time)
        append_queue.task_done()
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    append_queue.join()
    #append_queue_process.join()
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__ == "__main__":
    main()
AnalyzeFrequency analyzes the frequencies of words in a file, and get_results() returns a sorted list of said words and frequencies. The list is very large, perhaps 10000 items.
This list is then passed to the add_to_append_queue method, which adds it to a queue. process_append_queue takes the items one by one and adds the frequencies to a "database". This operation takes a bit longer than the actual analysis in main(), so I am trying to use a separate process for this method. When I try to do this with the threading module, everything works perfectly fine, no errors. When I try to use Process, the script hangs at item = append_queue.get().
Could someone please explain what is happening here, and perhaps direct me toward a fix?
All answers appreciated!
UPDATE
The pickle error was my fault, it was just a typo. Now I am using the Queue class within multiprocessing but the append_queue.get() method still hangs.
NEW CODE
import time, sys, random
from multiprocessing import Process, Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue()
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        database.append(append_queue.get())
        print("Appended to database in %.4f seconds" % database.append_time)
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    #append_queue.join()
    #append_queue_process.join()
    print str(append_queue.qsize())
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__ == "__main__":
    main()
UPDATE 2
This is the database code:
class FrequencyStore:
    def __init__(self):
        self.sorter = Sorter()
        self.db = {}
        self.load_time = -1
        self.save_time = -1
        self.append_time = -1
        self.sort_time = -1

    def load_db(self):
        start_time = time.time()
        try:
            file = open("results.txt", 'r')
        except:
            raise IOError
        self.db = {}
        for line in file:
            word, count = line.strip("\n").split("=")
            self.db[word] = int(count)
        file.close()
        self.load_time = time.time() - start_time

    def save_db(self):
        start_time = time.time()
        _db = []
        for key in self.db:
            _db.append([key, self.db[key]])
        _db = self.sort(_db)
        try:
            file = open("results.txt", 'w')
        except:
            raise IOError
        file.truncate(0)
        for x in _db:
            file.write(x[0] + "=" + str(x[1]) + "\n")
        file.close()
        self.save_time = time.time() - start_time

    def create_sorted_db(self):
        _temp_db = []
        for key in self.db:
            _temp_db.append([key, self.db[key]])
        _temp_db = self.sort(_temp_db)
        _temp_db.reverse()
        return _temp_db

    def get_db(self):
        return self.db

    def sort(self, _list):
        start_time = time.time()
        _list = self.sorter.mergesort(_list)
        _list.reverse()
        self.sort_time = time.time() - start_time
        return _list

    def append(self, _list):
        start_time = time.time()
        for x in _list:
            if x[0] not in self.db:
                self.db[x[0]] = x[1]
            else:
                self.db[x[0]] += x[1]
        self.append_time = time.time() - start_time
Comments suggest you're trying to run this on Windows. As I said in a comment,
If you're running this on Windows, it can't work - Windows doesn't
have fork(), so each process gets its own Queue and they have nothing
to do with each other. The entire module is imported "from scratch" by
each process on Windows. You'll need to create the Queue in main(),
and pass it as an argument to the worker function.
Here's fleshing out what you need to do to make it portable, although I removed all the database stuff because it's irrelevant to the problems you've described so far. I also removed the daemon fiddling, because that's usually just a lazy way to avoid shutting down things cleanly, and often as not will come back to bite you later:
def process_append_queue(append_queue):
    while True:
        x = append_queue.get()
        if x is None:
            break
        print("processed %d" % x)
    print("worker done")

def main():
    import multiprocessing as mp
    append_queue = mp.Queue(10)
    append_queue_process = mp.Process(target=process_append_queue, args=(append_queue,))
    append_queue_process.start()
    for i in range(100):
        append_queue.put(i)
    append_queue.put(None)  # tell worker we're done
    append_queue_process.join()

if __name__ == "__main__":
    main()
The output is the "obvious" stuff:
processed 0
processed 1
processed 2
processed 3
processed 4
...
processed 96
processed 97
processed 98
processed 99
worker done
Note: because Windows doesn't (can't) fork(), it's impossible for worker processes to inherit any Python object on Windows. Each process runs the entire program from its start. That's why your original program couldn't work: each process created its own Queue, wholly unrelated to the Queue in the other process. In the approach shown above, only the main process creates a Queue, and the main process passes it (as an argument) to the worker process.
queue.Queue is thread-safe, but doesn't work across processes. This is quite easy to fix, though. Instead of:
from multiprocessing import Process
from Queue import Queue
You want:
from multiprocessing import Process, Queue

Python threads exit immediately

I am new to Python.
I am trying out the HBase Thrift client using thrift. I found some code on the net, which I modified to work with the latest version of thrift, but when I run it, it just exits; no threads are started.
Here is the code:
import json, traceback, sys, datetime, time, logging, threading, random
import logging.handlers
import thrift

sys.path.append('gen-py')
from thrift.transport.TSocket import TSocket
from thrift.transport.TTransport import TBufferedTransport
from thrift.protocol import TBinaryProtocol
from hbase import THBaseService
from hbase.ttypes import TColumnValue  # struct types come from the generated ttypes module (assumed path)

gWritenItems = 0
gStartT = 0
gEndT = 0
recordsPerBatch = 300  # reports per client per day
columns = 3

# config
concurrent = 10
records = 60000  #6000000 #6 million
bytesPerRecord = 1024

mylock = threading.RLock()

class writeThread(threading.Thread):
    def __init__(self, threadname, RecordsThreadwillwrite):
        threading.Thread.__init__(self, name=threadname)
        bytesPerColumn = int(bytesPerRecord / columns) - 11  # suppose 3 columns
        self.columnvalue = "value_" + "x" * bytesPerColumn + "_endv"
        self.tbwBatch = int(RecordsThreadwillwrite / recordsPerBatch)
        self.transport = TBufferedTransport(TSocket('pnq-adongrevm1', 5151), 40960)
        self.transport.open()
        protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = THBaseService.Client(protocol)
        self.table = "example"

    def run(self):
        print "+%s start" % (self.getName())
        global gEndT
        global gWritenItems
        threadWritenItem = 0
        for loopidx in xrange(0, self.tbwBatch):
            self.write_hbase()  # write
            threadWritenItem += recordsPerBatch
        mylock.acquire()
        gEndT = time.time()
        gWritenItems += threadWritenItem
        print "%s done, %s seconds past, %d records saved" % (self.getName(), gEndT - gStartT, gWritenItems)
        mylock.release()
        self.transport.close()

    def write_hbase(self):  # write 50 rowkeys, and 3 column families in each rowkey
        print self.getName(), "Start write"
        batchmutations = []
        for i in xrange(0, recordsPerBatch):  # write to db, 300 items together
            mutations = []
            rowkey = "RK_%s_%s" % (random.random(), time.time())
            for ii in xrange(0, columns):
                mutations.append(THBaseService.TPut(row=rowkey, columnValues=[TColumnValue(family="f1", qualifier="%s" % ii, value=self.columnvalue)]))
            self.client.putMultiple(self.table, mutations)

itemsPerThread = int(records / concurrent)
for threadid in xrange(0, concurrent):
    gStartT = time.time()
    t = writeThread("Thread_%s" % threadid, itemsPerThread)
    t.start()
print "%d thread created, each thread will write %d records" % (concurrent, itemsPerThread)
I just get the message "10 thread created, each thread will write 6000 records" and then the program exits.
Yep, this is because you are not waiting for threads to finish their job, so the main thread just exits. Try this:
itemsPerThread = int(records / concurrent)
threads = []
for threadid in xrange(0, concurrent):
    gStartT = time.time()
    t = writeThread("Thread_%s" % threadid, itemsPerThread)
    t.start()
    threads.append(t)

# wait until all threads finish the job
for t in threads:
    t.join()
EDIT Ha, I don't think I'm right here, because you didn't mark your threads as daemons. It should work even without joining. But have a look at this code:
class CustomThread(threading.Thread):
    def run(self):
        print "test"

for x in xrange(0, 10):
    t = CustomThread()
    t.start()
It will always reach the print "test" line, no matter what. So your code should always reach print "+%s start" % (self.getName()), no matter what. Are you sure it doesn't work? :)
If it doesn't, then there are only two possibilities:
There is a blocking operation and/or exception in your __init__ method (see the sketch below). But then it would not reach the final print;
The concurrent variable is 0 for some reason (which is not consistent with the final print).
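Worth noting for the first possibility: in the code above, the Thrift connection is opened in __init__ (self.transport.open()), which runs in the main thread when each writeThread is constructed, before start() is ever called, so a slow or failing connect could stall or kill the program before any thread prints. A hedged sketch of deferring the connection into run(), reusing the host, port, and buffer size from the question:

class writeThread(threading.Thread):
    def __init__(self, threadname, RecordsThreadwillwrite):
        threading.Thread.__init__(self, name=threadname)
        self.tbwBatch = int(RecordsThreadwillwrite / recordsPerBatch)
        # no network work here: construction can never block

    def run(self):
        # connect inside the thread, so the connects happen concurrently
        # and a failure is confined to this thread
        transport = TBufferedTransport(TSocket('pnq-adongrevm1', 5151), 40960)
        transport.open()
        client = THBaseService.Client(TBinaryProtocol.TBinaryProtocol(transport))
        # ... batching and putMultiple() calls as in the original write_hbase() ...
        transport.close()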
