I'm running a Python script on a Sun Grid Engine supercompute cluster that reads in a list of file ids, sends each to a worker process for analysis, and writes one output per input file to disk.
The trouble is I'm getting IOError(110, 'Connection timed out') somewhere inside the worker function, and I'm not sure why. I've received this error in the past when making network requests that were severely delayed, but in this case the worker is only trying to read data from disk.
My question is: What would cause a Connection timed out error when reading from disk, and how can one resolve this error? Any help others can offer would be very appreciated.
Full script (the IOError crops up in minhash_text()):
from datasketch import MinHash
from multiprocessing import Pool
from collections import defaultdict
from nltk import ngrams
import json
import sys
import codecs
import config
cores = 24
window_len = 12
step = 4
worker_files = 50
permutations = 256
hashband_len = 4
def minhash_text(args):
'''Return a list of hashband strings for an input doc'''
try:
file_id, path = args
with codecs.open(path, 'r', 'utf8') as f:
f = f.read()
all_hashbands = []
for window_idx, window in enumerate(ngrams(f.split(), window_len)):
window_hashbands = []
if window_idx % step != 0:
continue
minhash = MinHash(num_perm=permutations, seed=1)
for ngram in set(ngrams(' '.join(window), 3)):
minhash.update( ''.join(ngram).encode('utf8') )
hashband_vals = []
for i in minhash.hashvalues:
hashband_vals.append(i)
if len(hashband_vals) == hashband_len:
window_hashbands.append( '.'.join([str(j) for j in hashband_vals]) )
hashband_vals = []
all_hashbands.append(window_hashbands)
return {'file_id': file_id, 'hashbands': all_hashbands}
except Exception as exc:
print(' ! error occurred while processing', file_id, exc)
return {'file_id': file_id, 'hashbands': []}
if __name__ == '__main__':
file_ids = json.load(open('file_ids.json'))
file_id_path_tuples = [(file_id, path) for file_id, path in file_ids.items()]
worker_id = int(sys.argv[1])
worker_ids = list(ngrams(file_id_path_tuples, worker_files))[worker_id]
hashband_to_ids = defaultdict(list)
pool = Pool(cores)
for idx, result in enumerate(pool.imap(minhash_text, worker_ids)):
print(' * processed', idx, 'results')
file_id = result['file_id']
hashbands = result['hashbands']
for window_idx, window_hashbands in enumerate(hashbands):
for hashband in window_hashbands:
hashband_to_ids[hashband].append(file_id + '.' + str(window_idx))
with open(config.out_dir + 'minhashes-' + str(worker_id) + '.json', 'w') as out:
json.dump(dict(hashband_to_ids), out)
It turned out I was hammering the filesystem too hard, making too many concurrent read requests for files on the same server. That server could only allow a fixed number of reads in a given period, so any requests over that limit received a Connection Timed Out response.
The solution was to wrap each file read request in a while loop. Inside that while loop, try to read the appropriate file from disk. If the Connection timed out error springs, sleep for a second and try again. Only once the file has been read may the while loop be broken.
Related
I can't wrap my head around how I could possibly rewrite my code to be multi-threaded.
The code I'm writing is made to automatically archive every single article in a list of newsgroups that exist, but I wanna be able to utilize my newsgroup plan and make it up to 20 threads. I've never coded threading before and my attempts were in vein.
Here's my code, excluding the username and pass ( but you can get a free account with max 5 threads if you really want to at https://my.xsusenet.com )
Please don't judge me too hard :(
import nntplib
import sys
import datetime
import os
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
resp, count, first, last, name = s.group(group)
for message_id in range(first, last):
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {message_id}")
outfile = open('.\\' + group + '\\' + str(message_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
Tried threading using a ThreadPoolExecutor, to cause it to use 20 threads, and failed, caused it to repeat the same process to the same message id. The expected result was to download 20 different messages at a time.
Here's the code I tried with threading, mind you I did like 6-8 variations of it to try and get it to work, this was the last one before I gave up to ask on here.
import nntplib
import sys
import datetime
import os
import concurrent.futures
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
def download_nntp_file(mess_id):
resp, count, first, last, name = s.group(group)
message_id = range(first, last)
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {mess_id}")
outfile = open('.\\' + group + '\\' + str(mess_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futures = executor.submit(download_nntp_file)
I can't test it with XSUseNet.
I wouldn't use global variables because when processes work at the same time then they may get the same values from these variables.
You should rather send values as parameters to functions.
Something like this:
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
for g_tuple in groups_list_tuple:
executor.submit(download_nntp_file, g_tuple)
But I would be simpler to use map() instead of submit() because it gets list with arguments and it doesn't need for-loop
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
executor.map(download_nntp_file, groups_list_tuple)
I am trying to write a python script scanning a folder and collect updated SQL script, and then automatically pull data for the SQL script. In the code, a while loop is scanning new SQL file, and send to data pull function. I am having trouble to understand how to make a dynamic queue with while loop, but also have multiprocess to run the tasks in the queue.
The following code has a problem that the while loop iteration will work on a long job before it moves to next iteration and collects other jobs to fill the vacant processor.
Update:
Thanks to #pbacterio for catching the bug, and now the error message is gone. After changing the code, the python code can take all the job scripts during one iteration, and distribute the scripts to four processors. However, it will get hang by a long job to go to next iteration, scanning and submitting the newly added job scripts. Any idea how to reconstruct the code?
I finally figured out the solution see answer below. It turned out what I was looking for is
the_queue = Queue()
the_pool = Pool(4, worker_main,(the_queue,))
For those stumble on the similar idea, following is the whole architecture of this automation script converting a shared drive to a 'server for SQL pulling' or any other job queue 'server'.
a. The python script auto_data_pull.py as shown in the answer. You need to add your own job function.
b. A 'batch script' with following:
start C:\Anaconda2\python.exe C:\Users\bin\auto_data_pull.py
c. Add a task triggered by start computer, run the 'batch script'
That's all. It works.
Python Code:
from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support
#
# Function run by worker processes
#
def worker(input, output):
for func, args in iter(input.get, 'STOP'):
result = compute(func, args)
output.put(result)
#
# Function used to compute result
#
def compute(func, args):
result = func(args)
return '%s says that %s%s = %s' % \
(current_process().name, func.__name__, args, result)
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
return files_dict
##### working in single thread
def single_thread():
path = "Y:/"
before = check_files(path)
sql_queue = []
while True:
time.sleep(3)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
# print sql_queue
for sql_file in sql_queue:
try:
query_sql(sql_file)
except:
pass
##### not working in queue
def multiple_thread():
NUMBER_OF_PROCESSES = 4
path = "Y:/"
sql_queue = []
before = check_files(path) # get the current dictionary of sql_files
task_queue = Queue()
done_queue = Queue()
while True: #while loop to check the changes of the files
time.sleep(5)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
TASKS = [(query_sql, sql_file) for sql_file in sql_queue]
# Create queues
#submit task
for task in TASKS:
task_queue.put(task)
for i in range(NUMBER_OF_PROCESSES):
p = Process(target=worker, args=(task_queue, done_queue)).start()
# try:
# p = Process(target=worker, args=(task_queue))
# p.start()
# except:
# pass
# Get and print results
print 'Unordered results:'
for i in range(len(TASKS)):
print '\t', done_queue.get()
# Tell child processes to stop
for i in range(NUMBER_OF_PROCESSES):
task_queue.put('STOP')
# single_thread()
if __name__ == '__main__':
# freeze_support()
multiple_thread()
Reference:
monitor file changes with python script: http://timgolden.me.uk/python/win32_how_do_i/watch_directory_for_changes.html
Multiprocessing:
https://docs.python.org/2/library/multiprocessing.html
Where did you define sql_file in multiple_thread() in
multiprocessing.Process(target=query_sql, args=(sql_file)).start()
You have not defined sql_file in the method and moreover you have used that variable in a for loop. The variable's scope is only confined to the for loop.
Try replacing this:
result = func(*args)
by this:
result = func(args)
I have figured this out. Thank your for the response inspired the thought.
Now the script can run a while loop to monitor the folder for new updated/added SQL script, and then distribute the data pulling to multiple threads. The solution comes from the queue.get(), and queue.put(). I assume the queue object takes care of the communication by itself.
This is the final code --
from glob import glob
import os, time
import sys
import pypyodbc
from multiprocessing import Process, Queue, Event, Pool, current_process, freeze_support
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
try:
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
except:
pass
return files_dict
def worker_main(queue):
print os.getpid(),"working"
while True:
item = queue.get(True)
query_sql(item)
def main():
the_queue = Queue()
the_pool = Pool(4, worker_main,(the_queue,))
path = "Y:/"
before = check_files(path) # get the current dictionary of sql_files
while True: #while loop to check the changes of the files
time.sleep(5)
sql_queue = []
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
if sql_queue:
for jsl_file in sql_queue:
try:
the_queue.put(jsl_file)
except:
print "{0} failed with error {1}. \n".format(jsl_file, str(sys.exc_info()[0]))
pass
else:
pass
if __name__ == "__main__":
main()
I have a non-seekable file-like object. In particular it is a file of indeterminate size coming from an HTTP request.
import requests
fileobj = requests.get(url, stream=True)
I am streaming this file to a call to an Amazon AWS SDK function which is writing the contents to Amazon S3. This is working fine.
import boto3
s3 = boto3.resource('s3')
s3.bucket('my-bucket').upload_fileobj(fileobj, 'target-file-name')
However, at the same time as streaming it to S3 I want to also stream the data to another process. This other process may not need the entire stream and might stop listening at some point; this is fine and should not affect the stream to S3.
It's important I don't use too much memory, since some of these files could be enormous. I don't want to write anything to disk for the same reason.
I don't mind if either sink is slowed down due to the other being slow, as long as S3 eventually gets the entire file, and the data goes to both sinks (rather, to each one which still wants it).
What's the best way to go about this in Python (3)? I know I can't just pass the same file object to both sinks, such as
s3.bucket('my-bucket').upload_fileobj(fileobj, 'target-file-name')
# At the same time somehow as
process = subprocess.Popen(['myapp'], stdin=fileobj)
I think I could write a wrapper for the file-like object which passes any data read not only to the caller (which would be the S3 sink) but also to the other process. Something like
class MyFilewrapper(object):
def __init__(self, fileobj):
self._fileobj = fileobj
self._process = subprocess.Popen(['myapp'], stdin=popen.PIPE)
def read(self, size=-1):
data = self._fileobj.read(size)
self._process.stdin.write(data)
return data
filewrapper = MyFilewrapper(fileobj)
s3.bucket('my-bucket').upload_fileobj(filewrapper, 'target-file-name')
But is there a better way to do it? Perhaps something like
streams = StreamDuplicator(fileobj, streams=2)
s3.bucket('my-bucket').upload_fileobj(streams[0], 'target-file-name')
# At the same time somehow as
process = subprocess.Popen(['myapp'], stdin=streams[1])
The discomfort regarding your MyFilewrapper solution arises, because the IO loop inside upload_fileobj is now in control of feeding the data to a subprocess that is strictly speaking unrelated to the upload.
A "proper" solution would involve an upload API that provides a file-like object for writing the upload stream with an outside loop. That would allow you to feed the data to both target streams "cleanly".
The following example shows the basic concept. The fictional startupload method provides the file-like object for uploading. Of cource you would need to add proper error handling etc.
fileobj = requests.get(url, stream=True)
upload_fd = s3.bucket('my-bucket').startupload('target-file-name')
other_fd = ... # Popen or whatever
buf = memoryview(bytearray(4046))
while True:
r = fileobj.read_into(buf)
if r == 0:
break
read_slice = buf[:r]
upload_fd.write(read_slice)
other_fd.write(read_slice)
Here is an implementation of StreamDuplicator with requested functionality and use model. I verified that it handles correctly the case when one of the sinks stops consuming the respective stream half-way.
Usage:
./streamduplicator.py <sink1_command> <sink2_command> ...
Example:
$ seq 100000 | ./streamduplicator.py "sed -n '/0000/ {s/^/sed: /;p}'" "grep 1234"
Output:
sed: 10000
1234
11234
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
21234
sed: 20000
31234
sed: 30000
41234
sed: 40000
51234
sed: 50000
61234
sed: 60000
71234
sed: 70000
81234
sed: 80000
91234
sed: 90000
sed: 100000
streamduplicator.py:
#!/usr/bin/env python3
import sys
import os
from subprocess import Popen
from threading import Thread
from time import sleep
import shlex
import fcntl
WRITE_TIMEOUT=0.1
def write_or_timeout(stream, data, timeout):
data_to_write = data[:]
time_to_sleep = 1e-6
time_remaining = 1.0 * timeout
while time_to_sleep != 0:
try:
stream.write(data_to_write)
return True
except BlockingIOError as ex:
data_to_write = data_to_write[ex.characters_written:]
if ex.characters_written == 0:
time_to_sleep *= 2
else:
time_to_sleep = 1e-6
time_remaining = timeout
time_to_sleep = min(time_remaining, time_to_sleep)
sleep(time_to_sleep)
time_remaining -= time_to_sleep
return False
class StreamDuplicator(object):
def __init__(self, stream, n, timeout=WRITE_TIMEOUT):
self.stream = stream
self.write_timeout = timeout
self.pipereadstreams = []
self.pipewritestreams = []
for i in range(n):
(r, w) = os.pipe()
readStream = open(r, 'rb')
self.pipereadstreams.append(readStream)
old_flags = fcntl.fcntl(w, fcntl.F_GETFL);
fcntl.fcntl(w, fcntl.F_SETFL, old_flags|os.O_NONBLOCK)
self.pipewritestreams.append(os.fdopen(w, 'wb'))
Thread(target=self).start()
def __call__(self):
while True:
data = self.stream.read(1024*16)
if len(data) == 0:
break
surviving_pipes = []
for p in self.pipewritestreams:
if write_or_timeout(p, data, self.write_timeout) == True:
surviving_pipes.append(p)
self.pipewritestreams = surviving_pipes
def __getitem__(self, i):
return self.pipereadstreams[i]
if __name__ == '__main__':
n = len(sys.argv)
streams = StreamDuplicator(sys.stdin.buffer, n-1, 3)
for (i,cmd) in zip(range(n-1), sys.argv[1:]):
Popen(shlex.split(cmd), stdin=streams[i])
Implementation limitations:
usage of fcntl to set a pipe writing file descriptor to non-blocking mode probably makes it unusable under Windows.
a closed/unsubscribed sink is detected through a write timeout.
I'm currently writing a script that reads reddit comments from a large file (5 gigs compressed, ~30 gigs of data being read). My script reads the comments, checks for some text, parses them, and sends them off to a Queue function (running in a seperate thread). No matter what I do, I always get a MemoryError on a specific iteration (number 8162735 if it matters in the slightest). And I can't seem to handle the error, Windows just keeps shutting down python when it hits. Here's my script:
import ujson
from tqdm import tqdm
import bz2
import json
import threading
import spacy
import Queue
import time
nlp = spacy.load('en')
def iter_comments(loc):
with bz2.BZ2File(loc) as file_:
for i, line in (enumerate(file_)):
yield ujson.loads(line)['body']
objects = iter_comments('RC_2015-01.bz2')
q = Queue.Queue()
f = open("reddit_dump.bin", 'wb')
def worker():
while True:
item = q.get()
f.write(item)
q.task_done()
for i in range(0, 2):
t = threading.Thread(target=worker)
t.daemon = True
t.start()
def finish_parse(comment):
global q
try:
comment_parse = nlp(unicode(comment))
comment_bytes = comment_parse.to_bytes()
q.put(comment_bytes)
except MemoryError:
print "MemoryError with comment {0}, waiting for Queue to empty".format(comment)
time.sleep(2)
except AssertionError:
print "AssertionError with comment {0}, skipping".format(comment)
for comment in tqdm(objects):
comment = str(comment.encode('ascii', 'ignore'))
if ">" in comment:
c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
c_parse_thread.start()
q.join()
f.close()
Does anybody know what I'm doing wrong?
Looks like its not in your code but may be in the data. Have you tried to skip that iteration?
x = 0
for comment in tqdm(objects):
x += 1
if x != 8162735
comment = str(comment.encode('ascii', 'ignore'))
if ">" in comment:
c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
c_parse_thread.start()
Consider the following program:
#!/usr/bin/env pypy
import json
import cStringIO
import sys
def main():
BUFSIZE = 10240
f = sys.stdin
decoder = json.JSONDecoder()
io = cStringIO.StringIO()
do_continue = True
while True:
read = f.read(BUFSIZE)
if len(read) < BUFSIZE:
do_continue = False
io.write(read)
try:
data, offset = decoder.raw_decode(io.getvalue())
print(data)
rest = io.getvalue()[offset:]
if rest.startswith('\n'):
rest = rest[1:]
decoder = json.JSONDecoder()
io = cStringIO.StringIO()
io.write(rest)
except ValueError, e:
#print(e)
#print(repr(io.getvalue()))
continue
if not do_continue:
break
if __name__ == '__main__':
main()
And here's a test case:
$ yes '{}' | pv | pypy parser-test.py >/dev/null
As you can see, the following script slows down when you add more input to it. This also happens with cPython. I tried to profile the script using mprof and cProfile, but I found no hint on why is that. Does anybody have a clue?
Apparently the string operations slowed it down. Instead of:
data, offset = decoder.raw_decode(io.getvalue())
print(data)
rest = io.getvalue()[offset:]
if rest.startswith('\n'):
rest = rest[1:]
It is better to do:
data, offset = decoder.raw_decode(io.read())
print(data)
rest = io.getvalue()[offset:]
io.truncate()
io.write(rest)
if rest.startswith('\n'):
io.seek(1)
You may want to close your StringIO at the end of the iteration (after writing).
io.close()
The memory buffer for a StringIO will free once it is closed, but will stay open otherwise. This would explain why each additional input is slowing your script down.