Threading NNTP, how? (Newbie here) - python

I can't wrap my head around how I could possibly rewrite my code to be multi-threaded.
The code I'm writing is made to automatically archive every single article in a list of newsgroups that exist, but I wanna be able to utilize my newsgroup plan and make it up to 20 threads. I've never coded threading before and my attempts were in vein.
Here's my code, excluding the username and pass ( but you can get a free account with max 5 threads if you really want to at https://my.xsusenet.com )
Please don't judge me too hard :(
import nntplib
import sys
import datetime
import os
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
resp, count, first, last, name = s.group(group)
for message_id in range(first, last):
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {message_id}")
outfile = open('.\\' + group + '\\' + str(message_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
Tried threading using a ThreadPoolExecutor, to cause it to use 20 threads, and failed, caused it to repeat the same process to the same message id. The expected result was to download 20 different messages at a time.
Here's the code I tried with threading, mind you I did like 6-8 variations of it to try and get it to work, this was the last one before I gave up to ask on here.
import nntplib
import sys
import datetime
import os
import concurrent.futures
basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]
s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') # I am only allowed 5 connections at a time, so try for 4.
groups = []
resp, groups_list_tuple = s.list()
def remove_non_ascii_2(string):
return string.encode('ascii', errors='ignore').decode()
def download_nntp_file(mess_id):
resp, count, first, last, name = s.group(group)
message_id = range(first, last)
resp, number, mes_id = s.next()
resp, info = s.article(mes_id)
if os.path.exists('.\\' + group):
pass
else:
os.mkdir('.\\' + group)
print(f"Downloading: {mess_id}")
outfile = open('.\\' + group + '\\' + str(mess_id), 'a', encoding="utf-8")
for line in info.lines:
outfile.write(remove_non_ascii_2(str(line)) + '\n')
outfile.close()
for g_tuple in groups_list_tuple:
#print(g_tuple) # DEBUG_LINE
# Parse group_list info
group = g_tuple[0]
last = g_tuple[1]
first = g_tuple[2]
flag = g_tuple[3]
# Parse newsgroup info
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futures = executor.submit(download_nntp_file)

I can't test it with XSUseNet.
I wouldn't use global variables because when processes work at the same time then they may get the same values from these variables.
You should rather send values as parameters to functions.
Something like this:
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
for g_tuple in groups_list_tuple:
executor.submit(download_nntp_file, g_tuple)
But I would be simpler to use map() instead of submit() because it gets list with arguments and it doesn't need for-loop
def download_nntp_file(g_tuple):
# ... code which uses `g_tuple` instead of global variables ...
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
executor.map(download_nntp_file, groups_list_tuple)

Related

Python multicore CSV short program, advice/help needed

I'm a hobby coder started with AHK, then some java and now I try to learn Python. I have searched and found some tips but I have yet not been able to implement it into my own code.
Hopefully someone here can help me, it's a very short program.
I'm using .txt csv database with ";" as a separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20.000 lines which makes the program "to slow", only using 25% CPU (1 core).
If I can make it use all 4 cores (100%) I guess it would perform the task alot faster. The task is basically to compare the CLIPBOARD with the database and if there is a match, it should give me an answer as a return. Perhaps also I can separate the database into 4 pieces?
The code right now looks like this! Not more then 65 lines and its doing its job (but to slow). Advice on how I can make this process into multi core needed.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def load_db():
while True:
try:
# Read and create database
db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
db = db.drop_duplicates()
return db
except:
print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
def top_answers(db, question):
db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = db.sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
return db_sorted
def write_txt(top):
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar.txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
def main():
try:
db = load_db()
last_db_reload = time.time()
while True:
# Get contents of clipboard
question = pp.paste()
# Rank answer
top = top_answers(db, question)
# If answer was found, show results
if len(top) > 0:
write_txt(top)
time.sleep(fall_back_time)
except:
print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
if name == 'main':
main()'
If you could divide the db into four equally large you could process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def worker(thread_id, question):
thread_id = str(thread_id)
db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
db = db.drop_duplicates()
db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = db.sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
top = db_sorted
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar" + thread_id + ".txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
return
def main():
question = pp.paste()
for i in range(1, 4):
t = threading.Thread(target=worker, args=(i, question))
t.start()
t.join()
if name == 'main':
main()
The solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses better pickle to tranfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing
ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()
def load_db():
while True:
try:
# Read and create database
db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
db.columns = ['question', 'answer']
#db = db.drop_duplicates() # i drop it for experiment
break
except:
print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
# split database into equal chunks:
# (if you have a lot of RAM, otherwise you
# need to compute ranges in db, something like
# chunk_size = len(db)//NUM_PROCESSES
# ranges[i] = (i*chunk_size, (i+1)*cjunk_size)
# and pass ranges in original db to processes
chunked_db = np.split(db, [NUM_PROCESSES], axis=0)
return chunked_db
def top_answers_multiprocessed(question, chunked_db):
# on unix, python uses 'fork' mode by default
# so the process has 'copy-on-change' access to all global variables
# i.e. if process will change something in db, it will be copied to it
# with a lot of overhead
# Unfortunately, I'fe heard that on Windows only 'spawn' mode with full
# copy of everything is used
# Process pipeline uses pickle, it's quite slow.
# so on small database you may not have benefit from multiprocessing
# If you are going to transfer big objects in or out, look
# in the direction of multiprocessing.Array
# this solution is not fully efficient,
# as pool is recreated each time
# You can create daemon processes which will monitor
# Queue for incoming questions, but it's harder to implement
def top_answers(idx):
# question is in the scope of parent function,
chunked_db[idx]['ratio'] = chunked_db[idx]['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = chunked_db[idx].sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
return db_sorted
with closing(Pool(processes=NUM_PROCESSES)) as pool:
# chunked_db is a list of databases
# they are in global scope, we send only index beacause
# all the data set is pickled
num_chunks = len(chunked_db)
# apply function top_answers across generator range(num_chunks)
res = pool.imap_unordered(top_answers, range(num_chunks))
res = list(res)
# now res is list of dataframes, let's join it
res_final = reduce(lambda left,right: pd.merge(left,right,on='ratio'), res)
return res_final
def write_txt(top):
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar.txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
def mainfunc():
global chunked_db
chunked_db = load_db()
last_db_reload = time.time()
print('db loaded')
last_clip = ""
while True:
# Get contents of clipboard
try:
new_clip = pp.paste()
except:
continue
if (new_clip != last_clip) and (len(new_clip)> 0):
print(new_clip)
last_clip = new_clip
question = new_clip.strip()
else:
continue
# Rank answer
top = top_answers_multiprocessed(question, chunked_db)
# If answer was found, show results
if len(top) > 0:
#write_txt(top)
print(top)
if __name__ == '__main__':
mainfunc()

Python: IOError 110 Connection timed out when reading from disk

I'm running a Python script on a Sun Grid Engine supercompute cluster that reads in a list of file ids, sends each to a worker process for analysis, and writes one output per input file to disk.
The trouble is I'm getting IOError(110, 'Connection timed out') somewhere inside the worker function, and I'm not sure why. I've received this error in the past when making network requests that were severely delayed, but in this case the worker is only trying to read data from disk.
My question is: What would cause a Connection timed out error when reading from disk, and how can one resolve this error? Any help others can offer would be very appreciated.
Full script (the IOError crops up in minhash_text()):
from datasketch import MinHash
from multiprocessing import Pool
from collections import defaultdict
from nltk import ngrams
import json
import sys
import codecs
import config
cores = 24
window_len = 12
step = 4
worker_files = 50
permutations = 256
hashband_len = 4
def minhash_text(args):
'''Return a list of hashband strings for an input doc'''
try:
file_id, path = args
with codecs.open(path, 'r', 'utf8') as f:
f = f.read()
all_hashbands = []
for window_idx, window in enumerate(ngrams(f.split(), window_len)):
window_hashbands = []
if window_idx % step != 0:
continue
minhash = MinHash(num_perm=permutations, seed=1)
for ngram in set(ngrams(' '.join(window), 3)):
minhash.update( ''.join(ngram).encode('utf8') )
hashband_vals = []
for i in minhash.hashvalues:
hashband_vals.append(i)
if len(hashband_vals) == hashband_len:
window_hashbands.append( '.'.join([str(j) for j in hashband_vals]) )
hashband_vals = []
all_hashbands.append(window_hashbands)
return {'file_id': file_id, 'hashbands': all_hashbands}
except Exception as exc:
print(' ! error occurred while processing', file_id, exc)
return {'file_id': file_id, 'hashbands': []}
if __name__ == '__main__':
file_ids = json.load(open('file_ids.json'))
file_id_path_tuples = [(file_id, path) for file_id, path in file_ids.items()]
worker_id = int(sys.argv[1])
worker_ids = list(ngrams(file_id_path_tuples, worker_files))[worker_id]
hashband_to_ids = defaultdict(list)
pool = Pool(cores)
for idx, result in enumerate(pool.imap(minhash_text, worker_ids)):
print(' * processed', idx, 'results')
file_id = result['file_id']
hashbands = result['hashbands']
for window_idx, window_hashbands in enumerate(hashbands):
for hashband in window_hashbands:
hashband_to_ids[hashband].append(file_id + '.' + str(window_idx))
with open(config.out_dir + 'minhashes-' + str(worker_id) + '.json', 'w') as out:
json.dump(dict(hashband_to_ids), out)
It turned out I was hammering the filesystem too hard, making too many concurrent read requests for files on the same server. That server could only allow a fixed number of reads in a given period, so any requests over that limit received a Connection Timed Out response.
The solution was to wrap each file read request in a while loop. Inside that while loop, try to read the appropriate file from disk. If the Connection timed out error springs, sleep for a second and try again. Only once the file has been read may the while loop be broken.

Python script use while loop to keep updating job scripts and multiprocess the tasks in queue

I am trying to write a python script scanning a folder and collect updated SQL script, and then automatically pull data for the SQL script. In the code, a while loop is scanning new SQL file, and send to data pull function. I am having trouble to understand how to make a dynamic queue with while loop, but also have multiprocess to run the tasks in the queue.
The following code has a problem that the while loop iteration will work on a long job before it moves to next iteration and collects other jobs to fill the vacant processor.
Update:
Thanks to #pbacterio for catching the bug, and now the error message is gone. After changing the code, the python code can take all the job scripts during one iteration, and distribute the scripts to four processors. However, it will get hang by a long job to go to next iteration, scanning and submitting the newly added job scripts. Any idea how to reconstruct the code?
I finally figured out the solution see answer below. It turned out what I was looking for is
the_queue = Queue()
the_pool = Pool(4, worker_main,(the_queue,))
For those stumble on the similar idea, following is the whole architecture of this automation script converting a shared drive to a 'server for SQL pulling' or any other job queue 'server'.
a. The python script auto_data_pull.py as shown in the answer. You need to add your own job function.
b. A 'batch script' with following:
start C:\Anaconda2\python.exe C:\Users\bin\auto_data_pull.py
c. Add a task triggered by start computer, run the 'batch script'
That's all. It works.
Python Code:
from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support
#
# Function run by worker processes
#
def worker(input, output):
for func, args in iter(input.get, 'STOP'):
result = compute(func, args)
output.put(result)
#
# Function used to compute result
#
def compute(func, args):
result = func(args)
return '%s says that %s%s = %s' % \
(current_process().name, func.__name__, args, result)
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
return files_dict
##### working in single thread
def single_thread():
path = "Y:/"
before = check_files(path)
sql_queue = []
while True:
time.sleep(3)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
# print sql_queue
for sql_file in sql_queue:
try:
query_sql(sql_file)
except:
pass
##### not working in queue
def multiple_thread():
NUMBER_OF_PROCESSES = 4
path = "Y:/"
sql_queue = []
before = check_files(path) # get the current dictionary of sql_files
task_queue = Queue()
done_queue = Queue()
while True: #while loop to check the changes of the files
time.sleep(5)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
TASKS = [(query_sql, sql_file) for sql_file in sql_queue]
# Create queues
#submit task
for task in TASKS:
task_queue.put(task)
for i in range(NUMBER_OF_PROCESSES):
p = Process(target=worker, args=(task_queue, done_queue)).start()
# try:
# p = Process(target=worker, args=(task_queue))
# p.start()
# except:
# pass
# Get and print results
print 'Unordered results:'
for i in range(len(TASKS)):
print '\t', done_queue.get()
# Tell child processes to stop
for i in range(NUMBER_OF_PROCESSES):
task_queue.put('STOP')
# single_thread()
if __name__ == '__main__':
# freeze_support()
multiple_thread()
Reference:
monitor file changes with python script: http://timgolden.me.uk/python/win32_how_do_i/watch_directory_for_changes.html
Multiprocessing:
https://docs.python.org/2/library/multiprocessing.html
Where did you define sql_file in multiple_thread() in
multiprocessing.Process(target=query_sql, args=(sql_file)).start()
You have not defined sql_file in the method and moreover you have used that variable in a for loop. The variable's scope is only confined to the for loop.
Try replacing this:
result = func(*args)
by this:
result = func(args)
I have figured this out. Thank your for the response inspired the thought.
Now the script can run a while loop to monitor the folder for new updated/added SQL script, and then distribute the data pulling to multiple threads. The solution comes from the queue.get(), and queue.put(). I assume the queue object takes care of the communication by itself.
This is the final code --
from glob import glob
import os, time
import sys
import pypyodbc
from multiprocessing import Process, Queue, Event, Pool, current_process, freeze_support
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
try:
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
except:
pass
return files_dict
def worker_main(queue):
print os.getpid(),"working"
while True:
item = queue.get(True)
query_sql(item)
def main():
the_queue = Queue()
the_pool = Pool(4, worker_main,(the_queue,))
path = "Y:/"
before = check_files(path) # get the current dictionary of sql_files
while True: #while loop to check the changes of the files
time.sleep(5)
sql_queue = []
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
if sql_queue:
for jsl_file in sql_queue:
try:
the_queue.put(jsl_file)
except:
print "{0} failed with error {1}. \n".format(jsl_file, str(sys.exc_info()[0]))
pass
else:
pass
if __name__ == "__main__":
main()

Python MemoryError with Queue and threading

I'm currently writing a script that reads reddit comments from a large file (5 gigs compressed, ~30 gigs of data being read). My script reads the comments, checks for some text, parses them, and sends them off to a Queue function (running in a seperate thread). No matter what I do, I always get a MemoryError on a specific iteration (number 8162735 if it matters in the slightest). And I can't seem to handle the error, Windows just keeps shutting down python when it hits. Here's my script:
import ujson
from tqdm import tqdm
import bz2
import json
import threading
import spacy
import Queue
import time
nlp = spacy.load('en')
def iter_comments(loc):
with bz2.BZ2File(loc) as file_:
for i, line in (enumerate(file_)):
yield ujson.loads(line)['body']
objects = iter_comments('RC_2015-01.bz2')
q = Queue.Queue()
f = open("reddit_dump.bin", 'wb')
def worker():
while True:
item = q.get()
f.write(item)
q.task_done()
for i in range(0, 2):
t = threading.Thread(target=worker)
t.daemon = True
t.start()
def finish_parse(comment):
global q
try:
comment_parse = nlp(unicode(comment))
comment_bytes = comment_parse.to_bytes()
q.put(comment_bytes)
except MemoryError:
print "MemoryError with comment {0}, waiting for Queue to empty".format(comment)
time.sleep(2)
except AssertionError:
print "AssertionError with comment {0}, skipping".format(comment)
for comment in tqdm(objects):
comment = str(comment.encode('ascii', 'ignore'))
if ">" in comment:
c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
c_parse_thread.start()
q.join()
f.close()
Does anybody know what I'm doing wrong?
Looks like its not in your code but may be in the data. Have you tried to skip that iteration?
x = 0
for comment in tqdm(objects):
x += 1
if x != 8162735
comment = str(comment.encode('ascii', 'ignore'))
if ">" in comment:
c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
c_parse_thread.start()

Successive multiprocessing

I am filtering huge text files using multiprocessing.py. The code basically opens the text files, works on it, then closes it.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create a Launcher and a LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
global LauncherCount
LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work too as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to edit globally a variable in an imported file? Or, is there any way to do this in any other way? What I need is running a code multiple times, in changing one value, and without using any loop apparently.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount
class Filter:
""" Filtering methods """
def __init__(self):
print("launching methods")
# Return the list: [Latitude,Longitude] (elements are floating point numbers)
def LatLong(self,line):
comaCount = []
comaCount.append(line.find(','))
comaCount.append(line.find(',',comaCount[0] + 1))
comaCount.append(line.find(',',comaCount[1] + 1))
Lat = line[comaCount[0] + 1 : comaCount[1]]
Long = line[comaCount[1] + 1 : comaCount[2]]
try:
return [float(Lat) , float(Long)]
except ValueError:
return [0,0]
# Return a boolean:
# - True if the Lat/Long is within the Lat/Long rectangle defined by:
# tupleFilter = (minLat,maxLat,minLong,maxLong)
# - False if not
def LatLongFilter(self,LatLongList , tupleFilter) :
if tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and
tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]:
return True
else:
return False
def writeLine(self,key,line):
filterDico[key][1].write(line)
def filteringProcess(dico):
myFilter = Filter()
while True:
try:
currentLine = readFile.readline()
except ValueError:
break
if len(currentLine) ==0: # Breaks at the end of the file
break
if len(currentLine) < 35: # Deletes wrong lines (too short)
continue
LatLongList = myFilter.LatLong(currentLine)
for key in dico:
if myFilter.LatLongFilter(LatLongList,dico[key][0]):
myFilter.writeLine(key,currentLine)
###########################################################################
# Main
###########################################################################
# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')
# Generate writing files:
pathDico = {}
filterDico = config.filterDico
# Create outputs
for key in filterDico:
output_Name = config.readFileList[LauncherCount.LauncherCount][0][:-4]
+ '_' + key +'.log'
pathDico[output_Name] = config.writingFolder + output_Name
filterDico[key] = [filterDico[key],open(pathDico[output_Name],'w')]
p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()
if __name__ == '__main__':
### Create and start processes:
for i in CPURange:
p.append(multiprocessing.Process(target = filteringProcess ,
args = (filterDico,)))
p[i].start()
### Kill processes:
while True:
if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
readFile.close()
for key in config.filterDico:
config.filterDico[key][1].close()
print(key,"is Done!")
endTime = time.localtime()
break
print("Process started at:",startingTime)
print("And ended at:",endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool
def work_on(args):
"""Process a single file."""
i, filename = args
print("working on %s" % (filename,))
return i
def files():
"""Generate input filenames to work on."""
#NOTE: you could read the file list from a file, get it using glob.glob, etc
yield "inputfile1"
yield "inputfile2"
def process_files(pool, filenames):
"""Process filenames using pool of processes.
Wait for results.
"""
for result in pool.imap_unordered(work_on, enumerate(filenames)):
#NOTE: in general the files won't be processed in the original order
print(result)
def main():
p = Pool()
# to do "successive" multiprocessing
for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
process_files(p, filenames)
if __name__=="__main__":
main()
Each process_file() is called in sequence after the previous one has been complete i.e., the files from different calls to process_files() are not processed in parallel.

Categories

Resources