How to open a file for each Python thread?

I have six threads
class ConsumerThread(Thread):
    def __init__(self, queue):
        super(ConsumerThread, self).__init__()
        self.queue = queue

    def run(self):
        item = self.queue.get()
        parser = MyHTMLParser()
        new_con = parser.feed(item)
        print(new_con)
        self.queue.task_done()
I want to write new_con to a file, with a single file per thread.

class ConsumerThread(Thread):
    def __init__(self, queue):
        super(ConsumerThread, self).__init__()
        self.queue = queue

    def run(self):
        item = self.queue.get()
        parser = MyHTMLParser()
        new_con = parser.feed(item)
        with open('file_name-%s' % self.name,
                  mode='w', encoding='utf8') as f:
            print(new_con, file=f)
        self.queue.task_done()
This will produce files with names like file_name-Thread-1 etc. You might want to use something nicer than this, but as a first shot, this solves your issues.
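If you want nicer names than Thread-1, a thread's name can be set explicitly before it starts. A small sketch (the six-consumer setup is an assumption based on the question):

from queue import Queue

queue = Queue()
consumers = []
for i in range(6):
    c = ConsumerThread(queue)
    c.name = 'consumer-%d' % i   # run() will then write file_name-consumer-0, ...
    c.start()
    consumers.append(c)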

Related

Queues or Dictionaries for shared resources in threads

I have two threads with while loops in them. The first produces data that the second needs to process in parallel. I need to share a variable.
Let's introduce a dummy input:
data = iter([1,2,3,4,5,6,7,8,9])
My first Thread class:
import threading
from queue import Queue
import time

class Thread1(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self._download = {}        # keep the dict on the instance so run() can reach it

    def run(self):
        i = 0
        while True:
            item = next(data)      # take one item and use it for both the dict and the queue
            self._download[i] = item
            self.queue.put(item)
            time.sleep(1)
            i += 1
My second Thread class:
class Thread2(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            self.queue.get()
            time.sleep(3)
with the main method:
q = Queue(maxsize=10)
t = Thread1(q)
s = Thread2(q)
t.start()
s.start()
I illustrated the two alternatives for this case. I can access the queue variable from the second thread, but I also want the second thread to access the dictionary.
What can I do to access the dictionary from Thread2 as well?
Which of the two choices should I opt for?
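One way to do it (a sketch of my own, not from the original thread): pass the same dictionary, together with a lock, to both threads, and guard every access with the lock. The queue is still the better fit for handing items from one thread to the other; the shared dict is useful when Thread2 needs random access to everything downloaded so far.

import threading
import time
from queue import Queue

data = iter([1, 2, 3, 4, 5, 6, 7, 8, 9])

class Thread1(threading.Thread):
    def __init__(self, queue, shared, lock):
        threading.Thread.__init__(self)
        self.queue = queue
        self.shared = shared              # the same dict object in both threads
        self.lock = lock

    def run(self):
        for i, item in enumerate(data):
            with self.lock:               # guard writes to the shared dict
                self.shared[i] = item
            self.queue.put(item)
            time.sleep(1)

class Thread2(threading.Thread):
    def __init__(self, queue, shared, lock):
        threading.Thread.__init__(self)
        self.queue = queue
        self.shared = shared
        self.lock = lock

    def run(self):
        while True:
            self.queue.get()
            with self.lock:               # guard reads as well
                print(dict(self.shared))
            time.sleep(3)

q = Queue(maxsize=10)
shared = {}
lock = threading.Lock()
Thread1(q, shared, lock).start()
Thread2(q, shared, lock).start()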

Avoid waiting for threads to finish in Python

I've written this script to read data from a txt file and process it. But it seems that if I give it a big file and a high number of threads, the further it gets through the list, the slower the script becomes.
Is there a way to avoid waiting for all the threads to finish and start a new one whenever a thread is done with the work?
Also it seems that when it finishes processing, the script doesn't exit.
import threading, Queue, time

class Work(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.Lock = threading.Lock()
        self.jobs = jobs

    def myFunction(self):
        # simulate work
        self.Lock.acquire()
        print("Firstname: " + self.firstname + " Lastname: " + self.lastname)
        self.Lock.release()
        time.sleep(3)

    def run(self):
        while True:
            self.item = self.jobs.get().rstrip()
            self.firstname = self.item.split(":")[0]
            self.lastname = self.item.split(":")[1]
            self.myFunction()
            self.jobs.task_done()

def main(file):
    jobs = Queue.Queue()
    myList = open(file, "r").readlines()
    MAX_THREADS = 10
    pool = [Work(jobs) for i in range(MAX_THREADS)]
    for thread in pool:
        thread.start()
    for item in myList:
        jobs.put(item)
    for thread in pool:
        thread.join()

if __name__ == '__main__':
    main('list.txt')
The script probably seems to take longer on larger inputs because there's a 3 second pause between each batch of printing.
The issue with the script not finishing is that, since you are using a Queue, you need to call join() on the Queue, not on the individual threads. To make sure the script exits when the jobs have stopped running, you should also set daemon = True.
The Lock will also not work in the current code because threading.Lock() produces a new lock each time. You need to have all the jobs share the same lock.
If you want to use this in Python 3 (which you should), the Queue module has been renamed to queue.
import threading, Queue, time

lock = threading.Lock()  # One lock

class Work(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.daemon = True  # set daemon
        self.jobs = jobs

    def myFunction(self):
        # simulate work
        lock.acquire()  # All jobs share the one lock
        print("Firstname: " + self.firstname + " Lastname: " + self.lastname)
        lock.release()
        time.sleep(3)

    def run(self):
        while True:
            self.item = self.jobs.get().rstrip()
            self.firstname = self.item.split(":")[0]
            self.lastname = self.item.split(":")[1]
            self.myFunction()
            self.jobs.task_done()

def main(file):
    jobs = Queue.Queue()
    with open(file, 'r') as fp:  # Close the file when we're done
        myList = fp.readlines()
    MAX_THREADS = 10
    pool = [Work(jobs) for i in range(MAX_THREADS)]
    for thread in pool:
        thread.start()
    for item in myList:
        jobs.put(item)
    jobs.join()  # Join the Queue

if __name__ == '__main__':
    main('list.txt')
Simpler example (based on an example from the Python docs)
import threading
import time
from Queue import Queue  # Py2
# from queue import Queue  # Py3

lock = threading.Lock()

def worker():
    while True:
        item = jobs.get()
        if item is None:
            break
        firstname, lastname = item.split(':')
        lock.acquire()
        print("Firstname: " + firstname + " Lastname: " + lastname)
        lock.release()
        time.sleep(3)
        jobs.task_done()

jobs = Queue()
pool = []
MAX_THREADS = 10
for i in range(MAX_THREADS):
    thread = threading.Thread(target=worker)
    thread.start()
    pool.append(thread)

with open('list.txt') as fp:
    for line in fp:
        jobs.put(line.rstrip())

# block until all tasks are done
jobs.join()

# stop workers
for i in range(MAX_THREADS):
    jobs.put(None)
for thread in pool:
    thread.join()
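For what it's worth, on Python 3 the same pattern can be had with less machinery from concurrent.futures.ThreadPoolExecutor, which starts the workers, hands out the items, and waits for completion when the with block exits. A small sketch of that alternative (list.txt is carried over from the question):

import time
from concurrent.futures import ThreadPoolExecutor

def process(line):
    firstname, lastname = line.rstrip().split(':')
    print("Firstname: " + firstname + " Lastname: " + lastname)
    time.sleep(3)  # simulate work

with open('list.txt') as fp:
    lines = list(fp)

# Exiting the with block waits for all submitted tasks to finish.
with ThreadPoolExecutor(max_workers=10) as executor:
    executor.map(process, lines)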

Python refactoring with thread and queue

I was trying to restructure my code; the first version is here.
What I want is to run two objects concurrently.
from queue import Queue
from threading import Thread
from html.parser import HTMLParser
import urllib.request

NUMBER_OF_THREADS = 3
HOSTS = ["http://yahoo.com", "http://google.com", "http://ibm.com"]

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print("Start tag:", tag)
        for attr in attrs:
            print("\tattr:", attr)

class ProducerThread(Thread):
    def __init__(self, queue):
        super(ProducerThread, self).__init__()
        self.queue = queue

        def run(self):
            while True:
                for host in HOSTS:
                    url = urllib.request.urlopen(host)
                    content = str(url.read(4096))
                    queue.put(content)

class ConsumerThread(Thread):
    def __init__(self, queue):
        super(ConsumerThread, self).__init__()
        self.queue = queue

        def run(self):
            while True:
                item = queue.get()
                parser = MyHTMLParser()
                new_con = parser.feed(item)
                print(new_con)
                queue.task_done()

if __name__ == '__main__':
    queue = Queue()
    p = ProducerThread(queue)
    c = ConsumerThread(queue)
    p.start()
    c.start()
When I run the code from the terminal there is no output. What should I change?
Unindent the run methods so that they are not inside the __init__ methods.
Note, however, that you almost certainly don't want those to loop forever; remove the while True.
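Putting both fixes together, one possible corrected version (my reconstruction, reusing the imports and MyHTMLParser from the question; note that HTMLParser.feed() returns None, so the useful output comes from handle_starttag):

class ProducerThread(Thread):
    def __init__(self, queue):
        super(ProducerThread, self).__init__()
        self.queue = queue

    def run(self):                        # at class level, not inside __init__
        for host in HOSTS:                # no while True: visit each host once
            url = urllib.request.urlopen(host)
            content = str(url.read(4096))
            self.queue.put(content)

class ConsumerThread(Thread):
    def __init__(self, queue):
        super(ConsumerThread, self).__init__()
        self.queue = queue
        self.daemon = True                # let the program exit once the work is done

    def run(self):
        while True:
            item = self.queue.get()
            parser = MyHTMLParser()
            parser.feed(item)             # prints tags via handle_starttag
            self.queue.task_done()

if __name__ == '__main__':
    queue = Queue()
    p = ProducerThread(queue)
    c = ConsumerThread(queue)
    p.start()
    c.start()
    p.join()        # wait for the producer to enqueue everything
    queue.join()    # wait until the consumer has processed it all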

How do I parse 1.5 billion lines of data efficiently?

I have 1,500,000,000 rows of data saved in multiple txt files. The data is formatted as follows:
key1 key2
where key1 is a url and key2 is a mysql record row_id.
I wrote the following Python code to parse the data, but it is slow:
import os  # needed for os.walk below
import Queue
import threading

class CheckThread(threading.Thread):
    def __init__(self, queue, src_folder, dest_folder='check_result'):
        super(CheckThread, self).__init__()
        self._queue = queue
        self.daemon = True

    def run(self):
        while True:
            file_name = self._queue.get()
            try:
                self._prepare_check(file_name)
            except:
                self._queue.task_done()
                continue
            self._queue.task_done()
def Check(src_folder, workers=12, dest_folder='check_result'):
    queue = Queue.Queue()
    for (dirpath, dirnames, filelist) in os.walk(src_folder):
        for name in filelist:
            if name[0] == '.':
                continue
            queue.put(os.path.join(dirpath, name))
    for worker in xrange(workers):
        worker = str(worker + 1)
        t = CheckThread(queue, src_folder, dest_folder)
        t.start()
    queue.join()

def main(folder, worker=12, out='check_result'):
    try:
        Check(folder, worker, out)
    except:
        return 1
    return 0
Each thread parses one file from the queue at a time.
How do I improve the parsing speed of each file?
Some suggestions:
Returning 1 on error and 0 on success is not pythonic.
Never use a bare except:; always specify which exceptions you want to catch.
Your first try: ... except: ... is not the proper structure to use here; you should use try: ... finally: ..., which executes the finally: part even if the try: part raises an exception.
Some of the parameters of CheckThread.__init__() are not being used.
What does CheckThread._prepare_check() do?
You are not using worker.
The changed code would be:
import os  # needed for os.walk below
import Queue
import threading

class CheckThread(threading.Thread):
    def __init__(self, queue, src_folder, dest_folder='check_result'):
        super(CheckThread, self).__init__()
        self._queue = queue
        self.daemon = True
        # Do something with src_folder and dest_folder or delete them from the parameter list

    def run(self):
        while True:
            file_name = self._queue.get()
            try:
                self._prepare_check(file_name)
            finally:
                self._queue.task_done()

def Check(src_folder, workers=12, dest_folder='check_result'):
    queue = Queue.Queue()
    for (dirpath, dirnames, filelist) in os.walk(src_folder):
        for name in filelist:
            if name[0] == '.':
                continue
            queue.put(os.path.join(dirpath, name))
    for worker in xrange(workers):
        worker = str(worker + 1)  # Do something with worker or delete this line
        t = CheckThread(queue, src_folder, dest_folder)
        t.start()
    queue.join()

def main(folder, worker=12, out='check_result'):
    Check(folder, worker, out)
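One thing the suggestions above do not cover: in CPython, threads all share the interpreter's global lock (the GIL), so for CPU-bound parsing a process pool usually scales better than a thread pool. A rough sketch of that direction (my own, with parse_file as a hypothetical stand-in for whatever _prepare_check actually does):

import os
from multiprocessing import Pool

def parse_file(path):
    # hypothetical stand-in for the real per-file work
    with open(path) as fp:
        for line in fp:
            key1, key2 = line.split()   # url and mysql row_id, per the question
            # ... process key1/key2 here ...

def Check(src_folder, workers=12):
    paths = [os.path.join(dirpath, name)
             for dirpath, dirnames, filelist in os.walk(src_folder)
             for name in filelist
             if not name.startswith('.')]
    pool = Pool(workers)                # one process per worker, sidestepping the GIL
    pool.map(parse_file, paths)
    pool.close()
    pool.join()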

Run Class methods in threads (python)

I'm currently learning Python and classes, and I have a basic question that I couldn't find an answer to. Let's say I have this dummy class:
class DomainOperations:
    def __init__(self, domain):
        self.domain = domain
        self.domain_ip = ''
        self.website_thumbnail = ''

    def resolve_domain(self):
        # resolve domain to ipv4 and save to self.domain_ip
        pass

    def generate_website_thumbnail(self):
        # generate website thumbnail and save the url to self.website_thumbnail
        pass
I want to run resolve_domain and generate_website_thumbnail simultaneously, and when the threads are finished I want to print the IP and the thumbnail.
EDIT: I know I should use threads, maybe something like this:
r = DomainOperations('google.com')
t1 = threading.Thread(target=r.resolve_domain)
t1.start()
t2 = threading.Thread(target=r.generate_website_thumbnail)
t2.start()
But should I use them outside the Class? Should I write another Class to handle Threads?
What is the right way to do that?
If you call them from the class, it is as simple as:
import threading

class DomainOperations:
    def __init__(self):
        self.domain_ip = ''
        self.website_thumbnail = ''

    def resolve_domain(self):
        self.domain_ip = 'foo'

    def generate_website_thumbnail(self):
        self.website_thumbnail = 'bar'

    def run(self):
        t1 = threading.Thread(target=self.resolve_domain)
        t2 = threading.Thread(target=self.generate_website_thumbnail)
        t1.start()
        t2.start()
        t1.join()
        t2.join()
        print(self.domain_ip, self.website_thumbnail)

if __name__ == '__main__':
    d = DomainOperations()
    d.run()
You can inherit from Thread in DomainOperations; that way the code is cleaner and easier to understand. You have to override the run() method.
from threading import Thread

class DomainOperations(Thread):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain_ip = ''
        self.website_thumbnail = ''

    def resolve_domain(self):
        self.domain_ip = 'foo'

    def generate_website_thumbnail(self):
        self.website_thumbnail = 'bar'

    def run(self):
        # the domain is resolved on this (first) thread
        self.resolve_domain()
        # the thumbnail is generated on a second, newly created thread
        thread2 = Thread(target=self.generate_website_thumbnail)
        thread2.start()
        # the first thread waits for thread2 to finish
        thread2.join()
        # both results are ready here, before the first thread exits
        print(self.domain_ip, self.website_thumbnail)
And you start your threads this way:
if __name__ == '__main__':
    thread1 = DomainOperations()
    thread1.start()
import csv
import json
import threading
import time

import requests

def post_test(tbid, line_num, response_time):
    """
    :param tbid: parameter id
    :return:
    """
    # request parameters
    data = {'tbId': tbid, 'conditions': [{"key": "", "type": 1}], 'pageNum': 1, 'pageSize': 12}
    # request start time
    start = time.time()
    # POST request
    r = requests.post(url=url, data=json.dumps(data), headers=headers)
    # request end time
    end = time.time()
    # keep two decimal places
    finall_time = float('%.2f' % float(end - start))
    text = json.loads(r.text)
    # write to the CSV; only write responses with status 200
    with open('text6.csv', 'a', newline='') as csvfile:
        if text['statusCode'] == '200':
            throughput = line_num * response_time / finall_time
            throughput = float('%.2f' % float(throughput))
            print('the perf_counter time of %s is %s and the content is %s, throughput is %s' % (
                tbid, finall_time, json.loads(r.text), throughput))
            spamwriter = csv.writer(csvfile, dialect='excel')
            spamwriter.writerow([tbid] + [finall_time] + [throughput])

def start_thread(csv_name):
    tbid, response_time_sort, throughput_sort = read_csv(csv_name)
    print(tbid)
    line_num = len(tbid)
    response_times = 5
    for j in range(response_times):
        for i in tbid:
            t = threading.Thread(target=post_test, args=(i, line_num, response_times))
            t.start()
            t.join()
I don't know how to call a method in a class, especially if it has initialization parameters, but you can try this approach. I'm also trying to use multiple processes to solve this problem.
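On the question of calling a method of a class from a thread: a bound method of an instance can be passed directly as the Thread target, with the initialization parameters supplied when the object is constructed. A small sketch (Tester, base_url and the URL are made-up names for illustration):

import threading

class Tester:
    def __init__(self, base_url):
        self.base_url = base_url      # an initialization parameter

    def post_test(self, tbid):
        print('testing %s against %s' % (tbid, self.base_url))

tester = Tester('http://example.com')  # hypothetical URL
threads = [threading.Thread(target=tester.post_test, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()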
