How to preserve file write order when using threading in python - python

I have some Python code that reads a file and pushes the data into a list. The list is then put on a queue and processed with threading, say 20 items at a time. After processing, I save the results to a new file. However, the lines in the new file end up in a different order than in the original file. For example, given this input:
1 a
2 b
3 c
4 a
5 d
But the output looks like:
2 aa
1 ba
4 aa
5 da
3 ca
Is there any way to preserve the original order?
Here is my code:
import threading,Queue,time,sys
class eSS(threading.Thread):
def __init__(self,queue):
threading.Thread.__init__(self)
self.queue = queue
self.lock = threading.Lock()
def ess(self,email,code,suggested,comment,reason,dlx_score):
#do something
def run(self):
while True:
info = self.queue.get()
infolist = info.split('\t')
email = infolist[1]
code = infolist[2]
suggested = infolist[3]
comment = infolist[4]
reason = infolist[5]
dlx_score = (0 if infolist[6] == 'NULL' else int(infolist[6]))
g.write(info + '\t' + self.ess(email,code,suggested,comment,reason,dlx_score) +'\r\n')
self.queue.task_done()
if __name__ == "__main__":
queue = Queue.Queue()
filename = sys.argv[1]
#Define number of threads
threads = 20
f = open(filename,'r')
g = open(filename+'.eSS','w')
lines = f.read().splitlines()
f.close()
start = time.time()
for i in range(threads):
t = eSS(queue)
t.setDaemon(True)
t.start()
for line in lines:
queue.put(line)
queue.join()
print time.time()-start
g.close()

Three thoughts come to mind. Common to all is to include an index with the packet that is queued for processing.
One thought then is to use the controller/workers/output framework in which the output thread de-queues the worker-processed data, assembles, and outputs it.
The second thought is to employ a memory-mapped file for output, and use the index to calculate the offset to write into the file (assumes fixed-length writes probably).
The third is to use the index to put processed data in a new list, and when the list is completed write the items out at the end rather than on the fly.

Related

Issues getting Python Multiprocessing Library Working when using the Process Function

I'm trying to build a list of parent/comment pairs from the publicly available Reddit data set.
I have a CSV file which I load into a Pandas dataframe which contains rows of the comments with the parent and child id, as well as the child comment. The data is loaded using the following block of code:
import os
import multiprocessing as mp
import numpy as np
import pandas as pd
sourcePATH = r'C:\'
workingFILE = r'\output-pt1.csv'
# filepaths
input_file = sourcePATH + workingFILE
data_df = pd.read_csv(input_file,header=None,names=['PostIDX','ParentIDX','Comment','Score','Controversiality'])
The aim is to scan through each row in the dataframe and use the parent id to search the rest of the dataframe to see if there is a parent comment present. If there is, I store the child and parent comments in a tuple together with some other information. This is then added to a list, which is written out to a CSV file at the end. To do this I use the following code:
def checkChildParent(ParentIDX_curr, ChildIDX_curr,ChildComment_curr,ChildScore_curr,ChildCont_curr):
idx = data_df.loc[data_df['PostIDX'] == ParentIDX_curr]
if idx.empty is False:
ParentComment = idx.iloc[0,2]
ParentScore = idx.iloc[0,3]
ParentCont = idx.iloc[0,4]
outPut.put([ParentIDX_curr[0], ParentComment,ParentScore,ParentCont,ChildIDX_curr[0], ChildComment_curr[0],ChildScore_curr[0],ChildCont_curr[0]])
if __name__ == '__main__':
print('Process started')
t_start_init = time.time()
t_start = time.time()
noCores = 1
#pool = mp.Pool(processes=noCores)
update_freq = 100
n = 1000
#n = round(len(data_df)/8)
flag_create = 0
flag_run = 0
i = 0
outPut = mp.Queue()
#parent_child_df = pd.DataFrame()
#parent_child_df.coumns = ['PostIDX','ParentIDX']
while i < n:
#print(i)
procs = []
ParentIDX = []
ParentComment = []
ParentScore = []
ParentCont = []
ChildIDX = []
ChildComment = []
ChildScore = []
ChildCont = []
for worker in range(0,noCores):
ParentIDX.append(data_df.iloc[i,1])
ChildIDX.append(data_df.iloc[i,0])
ChildComment.append(data_df.iloc[i,2])
ChildScore.append(data_df.iloc[i,3])
ChildCont.append(data_df.iloc[i,4])
i = i + 1
#when I call the function this way it returns the expected matches
#checkChildParent(ParentIDX,ChildIDX,ChildComment,
# ChildScore,ChildCont)
#when I call the function with Process function nothing appears to be happening
for proc in range(0,noCores):
p = mp.Process(target = checkChildParent, args=(ParentIDX[proc],ChildIDX[proc],ChildComment[proc],ChildScore[proc],ChildCont[proc]))
procs.append(p)
p.start()
#for p in procs:
# p.join()
if outPut.empty() is False:
print(outPut.get())
At the top of the file is a function which scans the dataframe for a given row and returns the tuple of the matched parent and child comment if one is found. If I call this function directly it works fine; however, when I call it via Process it doesn't match anything! I'm guessing the issue is caused by the form in which the arguments are passed to the function, but I have been trying to debug this all afternoon without success. If anyone has any suggestions, please let me know!
Thanks!

Multiprocessing hangs after several hundred jobs

I am trying to use this question for my file processing:
Python multiprocessing safely writing to a file
This is my modification of the code:
def listener(q):
'''listens for messages on the q, writes to file. '''
while 1:
reads = q.get()
if reads == 'kill':
#f.write('killed')
break
for read in reads:
out_bam.write(read)
out_bam.flush()
out_bam.close()
def fetch_reads(line, q):
parts = line[:-1].split('\t')
print(parts)
start,end = int(parts[1])-1,int(parts[2])-1
in_bam = pysam.AlignmentFile(args.bam, mode='rb')
fetched = in_bam.fetch(parts[0], start, end)
reads = [read for read in fetched if (read.cigarstring and read.pos >= start and read.pos < end and 'S' not in read.cigarstring)]
in_bam.close()
q.put(reads)
return reads
#must use Manager queue here, or will not work
manager = mp.Manager()
q = manager.Queue()
if not args.threads:
threads = 1
else:
threads = int(args.threads)
pool = mp.Pool(threads+1)
#put listener to work first
watcher = pool.apply_async(listener, (q,))
with open(args.bed,'r') as bed:
jobs = []
cnt = 0
for line in bed:
# Fire off the read fetchings
job = pool.apply_async(fetch_reads, (line, q))
jobs.append(job)
cnt += 1
if cnt > 10000:
break
# collect results from the workers through the pool result queue
for job in jobs:
job.get()
print('get')
#now we are done, kill the listener
q.put('kill')
pool.close()
The difference is that I am opening and closing the file inside the function, since otherwise I get unusual errors from bgzip.
At first, print(parts) and print('get') output is interleaved (more or less), then 'get' is printed less and less often. Ultimately the code hangs and nothing more is printed (all the parts are printed, but 'get' simply stops appearing). The output file remains zero bytes.
Can anyone lend a hand? Cheers!

Implementing multiprocessing in a loop scraper and appending the data

I am making a web scraper to build a database. The site I plan to use has index pages, each containing 50 links. The number of pages to be parsed is estimated at around 60K or more, which is why I want to use multiprocessing.
Here is some pseudo-code of what I want to do:
def harvester(index):
main=dict()
....
links = foo.findAll ( 'a')
for link in links:
main.append(worker(link))
# or maybe something like: map_async(worker(link))
def worker(url):
''' this function gather the data from the given url'''
return dictionary
What I want is to have a certain number of worker functions gathering data in parallel from different pages. This data would then be added to a big dictionary held in harvester, or written directly to a CSV file by the worker function.
I'm wondering how I can implement parallelism. I have done a fair
amount of research on using gevent, threading and multiprocessing, but
I am not sure how to implement it.
I am also not sure whether appending data to a large dictionary or writing
directly to a CSV using DictWriter will be stable with that many inputs arriving at the same time.
Thanks
I suggest splitting your work into separate workers which communicate via queues.
Here you mostly have IO wait time (crawling, csv writing)
So you can do the following (not tested, just see the idea):
import threading
import Queue
class CsvWriter(threading.Thread):
    """Consumer thread that drains a result queue into a CSV file.

    Each item taken from ``resultq`` is expected to be a dict and is written
    as one CSV row. The integer ``-1`` is the shutdown sentinel: when it is
    received the file is closed and the thread ends.

    Args:
        resultq: Queue of row dicts, terminated by ``-1``.
        fieldnames: Optional column names, in output order. When omitted,
            the keys of the first row received are used. Pass this
            explicitly if a fixed column order is required.
        path: Destination file; defaults to ``'results.csv'``.
    """

    def __init__(self, resultq, fieldnames=None, path='results.csv'):
        super(CsvWriter, self).__init__()
        self.resultq = resultq
        self.fieldnames = fieldnames
        self.path = path
        # Built in run(): DictWriter cannot be constructed without
        # fieldnames, which may only be known once the first row arrives.
        self.writer = None

    def run(self):
        """Write rows until the ``-1`` sentinel arrives, then close the file."""
        import csv
        import sys

        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            out = open(self.path, 'wb')
        else:
            out = open(self.path, 'w', newline='')
        try:
            if self.fieldnames is not None:
                self.writer = csv.DictWriter(out, fieldnames=self.fieldnames)
                self.writer.writeheader()
            while True:
                row = self.resultq.get()
                if row == -1:
                    break
                if self.writer is None:
                    # No explicit columns given: take them from the first row.
                    self.writer = csv.DictWriter(out, fieldnames=list(row))
                    self.writer.writeheader()
                self.writer.writerow(row)
        finally:
            # Always flush and release the file, even if a row is malformed.
            out.close()
class Crawler(threading.Thread):
    """Worker thread that turns links from one queue into results on another.

    Links are read from ``inputqueue`` until the integer ``-1`` sentinel is
    received. Each link is passed to :meth:`extract_data`, and every
    non-``None`` result is put on ``resultq``.

    Args:
        inputqueue: Queue of links to process, terminated by ``-1``.
        resultq: Queue that receives the extracted results.
    """

    def __init__(self, inputqueue, resultq):
        super(Crawler, self).__init__()
        self.iq = inputqueue
        self.oq = resultq

    def run(self):
        """Process links until the ``-1`` sentinel is taken from the queue."""
        while True:
            link = self.iq.get()
            if link == -1:
                break
            result = self.extract_data(link)
            # Do not forward empty results: a None row is of no use to
            # whatever consumes the result queue.
            if result is not None:
                self.oq.put(result)

    def extract_data(self, link):
        """Crawl ``link`` and return a dict of extracted data.

        Placeholder: override or fill in. Returning ``None`` means
        "nothing to report" and the link is skipped.
        """
        # crawl and extract what you need and return a dict
        pass
def main():
    """Run the crawl pipeline and block until everything has been written.

    Fills the link queue from ``your_urls``, starts one ``CsvWriter`` and ten
    ``Crawler`` threads, then shuts them down with ``-1`` sentinels: one per
    crawler on the link queue, and one on the result queue after all
    crawlers have been joined.
    """
    linkq = Queue.Queue()
    resultq = Queue.Queue()

    for url in your_urls:
        linkq.put(url)

    writer = CsvWriter(resultq)
    writer.start()

    crawlers = []
    for _ in xrange(10):
        crawlers.append(Crawler(linkq, resultq))
    for crawler in crawlers:
        crawler.start()

    # One sentinel per crawler, queued behind the real links, so each
    # crawler stops only after the work ahead of it has been taken.
    for _ in crawlers:
        linkq.put(-1)
    for crawler in crawlers:
        crawler.join()

    # All producers are done; now the writer can be told to finish.
    resultq.put(-1)
    writer.join()
This code should work (after fixing any typos) and will exit once all the URLs have been processed.

multiprocessing pipe poll hangs

I have 2 processes, one is the main process and one is a data generator.
I'm trying to generate data quite fast: a pair of values every 100 µs. Since the main process has other work to do, it tells the generator process via a simple multiprocessing pipe when it is ready to receive a new data package.
Unfortunately, the statuspipe.poll() call in the generator process hangs at irregular intervals for a few milliseconds. Why is that?
Parent process which sets up the data and the statuspipe:
from multiprocessing import Process,Pipe
self.pipe1_parent, self.pipe1_child = Pipe(duplex=False)
self.statuspipe_parent, self.statuspipe_child = Pipe(duplex=False)
self.process1 = Process(target=data_generator.generate_values, args=(self.data_generator,self.pipe1_child,self.statuspipe_parent))
self.process1.start()
Child process which generates the data
def generate_values(self,pipe1,statuspipe):
tmpcount = 0
self.data = np.empty((self.blocksize + 1,2), dtype=np.uint16)
while 1:
ValueGenerator.next(self)
#can the data pipe be filled with next dataset ?
if statuspipe.poll():
self.data[tmpcount,0] = tmpcount
self.data[tmpcount,1] = self.timestamp
pipe1.send(self.data[0:tmpcount])
tmpcount = 0
msg = statuspipe.recv()
else:
self.data[tmpcount,0] = tmpcount
self.data[tmpcount,1] = self.timestamp
tmpcount = tmpcount + 1
I've already tried setting up the status pipe with duplex=True and duplex=False, but I do not see much of a difference.

Python multi-threading missing jobs

If I run the script step by step it works perfectly, but when I use threading it misses 50-60% of the jobs. I'm using Python with the mechanize module.
#setting up the browser
mySite = 'http://example.com/managament.php?'
postData = {'UserID' : '', 'Action':'Delete'}
job_tab1_user1 = [1,2,3]
job_tab2_user1 = [4,5,6]
job_tab1_user2 = [7,8,9]
job_tab2_user2 = [10,12,13]
.... till user1000
#i want to point out that the lists are 100% different
def user1_jobs:
for i in job_tab1_user1:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
for i in job_tab2_user1:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
def user2_jobs:
for i in job_tab1_user2:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
for i in job_tab2_user2:
browser.open("http://example.com/jobs.php?actions="+i)
browser.open(mySite, Post_data)
... and so on till user 1000
And I call them in the end like this:
t_user1 = threading.Thread(target=user1_jobs, args=[])
t_user1.start()
t_user2 = threading.Thread(target=user2_jobs, args=[])
t_user2.start()
I have a similar script that sends around 200 requests per second and all of them are processed. I also tried using time.sleep(2), but it still misses a lot.
Besides what is wrong with my script, my other question is whether there is a way to compact this code, because I'm using 1000 users and the script runs to thousands of lines. Thank you in advance.
from threading import *
submits = [[1,2,3], [3,4,5], [6,7,8]]
class worker(Thread):
def __init__(self, site, postdata, data):
Thread.__init__(self)
self.data = data
self.site = site
self.postdata = postdata
self.start()
def run(self):
for i in self.data:
browser.open("http://example.com/jobs.php?actions="+str(i))
browser.open(self.site, self.postdata)
# Start one worker per entry in `submits`. Each worker must receive only its
# own job list (the current element), not the whole `submits` list.
for user_jobs in submits:
    worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, user_jobs)
Since the OP asked for it, here's a condensed/compressed version of the code.
or:
# Start 1000 workers, each given its own freshly built [1, 2, 3] job list.
for _ in range(1000):
    worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, list(range(1, 4)))
This works if the data you want to send really is a sequence of 3 integers (1, 2, 3) in ascending order.
Here is a full script that you can easily modify by changing the initial variables.
It creates a list dynamically and uses a generator to create the functions for each thread.
Currently it creates 1000 users, each with 2 tabs and 3 jobs.
# define your variables here
NUM_USERS = 1000
NUM_JOBS_PER_USER = 3
NUM_TABS_PER_USER = 2
URL_PART = "http://example.com/jobs.php?actions="

# Populate the list of jobs; the structure is jobs[user][tab][job].
# Job ids are numbered consecutively from 1 in user -> tab -> job order,
# so the id is derived from the position instead of a running counter.
jobs = [
    [
        [
            (user * NUM_TABS_PER_USER + tab) * NUM_JOBS_PER_USER + slot + 1
            for slot in range(NUM_JOBS_PER_USER)
        ]
        for tab in range(NUM_TABS_PER_USER)
    ]
    for user in range(NUM_USERS)
]
# create a generator that builds our thread functions
def generateFunctions(jobs):
    """Yield one zero-argument callable per job in ``jobs``.

    Args:
        jobs: Nested sequence indexed as ``jobs[user][tab][job]``.

    Yields:
        A function that opens the job URL for one specific job and then
        posts to ``mySite``. It is meant to be used as a thread target.
    """
    for user in jobs:
        for tab in user:
            for job in tab:
                # Bind the current job as a default argument. A plain closure
                # would look `job` up when the function is called, and by
                # then the generator may already have moved on to a later
                # job, so the thread would request the wrong one.
                def f(job=job):
                    browser.open(URL_PART + str(job))
                    # NOTE(review): `Post_data` is not defined in the visible
                    # code (only `postData` is) - confirm the intended name.
                    browser.open(mySite, Post_data)
                yield f
# Start one thread per generated function, keeping every Thread object in
# `threads` in case the handles are needed later (e.g. to join them).
threads = []
for job_fn in generateFunctions(jobs):
    job_thread = threading.Thread(target=job_fn)
    job_thread.start()
    threads.append(job_thread)

Categories

Resources