I am filtering huge text files using multiprocessing.py. The code basically opens the text files, works on it, then closes it.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create a Launcher and a LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
global LauncherCount
LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work too as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to edit globally a variable in an imported file? Or, is there any way to do this in any other way? What I need is running a code multiple times, in changing one value, and without using any loop apparently.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount
class Filter:
""" Filtering methods """
def __init__(self):
print("launching methods")
# Return the list: [Latitude,Longitude] (elements are floating point numbers)
def LatLong(self,line):
comaCount = []
comaCount.append(line.find(','))
comaCount.append(line.find(',',comaCount[0] + 1))
comaCount.append(line.find(',',comaCount[1] + 1))
Lat = line[comaCount[0] + 1 : comaCount[1]]
Long = line[comaCount[1] + 1 : comaCount[2]]
try:
return [float(Lat) , float(Long)]
except ValueError:
return [0,0]
# Return a boolean:
# - True if the Lat/Long is within the Lat/Long rectangle defined by:
# tupleFilter = (minLat,maxLat,minLong,maxLong)
# - False if not
def LatLongFilter(self,LatLongList , tupleFilter) :
if tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and
tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]:
return True
else:
return False
def writeLine(self,key,line):
filterDico[key][1].write(line)
def filteringProcess(dico):
myFilter = Filter()
while True:
try:
currentLine = readFile.readline()
except ValueError:
break
if len(currentLine) ==0: # Breaks at the end of the file
break
if len(currentLine) < 35: # Deletes wrong lines (too short)
continue
LatLongList = myFilter.LatLong(currentLine)
for key in dico:
if myFilter.LatLongFilter(LatLongList,dico[key][0]):
myFilter.writeLine(key,currentLine)
###########################################################################
# Main
###########################################################################
# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')
# Generate writing files:
pathDico = {}
filterDico = config.filterDico
# Create outputs
for key in filterDico:
output_Name = config.readFileList[LauncherCount.LauncherCount][0][:-4]
+ '_' + key +'.log'
pathDico[output_Name] = config.writingFolder + output_Name
filterDico[key] = [filterDico[key],open(pathDico[output_Name],'w')]
p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()
if __name__ == '__main__':
### Create and start processes:
for i in CPURange:
p.append(multiprocessing.Process(target = filteringProcess ,
args = (filterDico,)))
p[i].start()
### Kill processes:
while True:
if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
readFile.close()
for key in config.filterDico:
config.filterDico[key][1].close()
print(key,"is Done!")
endTime = time.localtime()
break
print("Process started at:",startingTime)
print("And ended at:",endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool
def work_on(args):
"""Process a single file."""
i, filename = args
print("working on %s" % (filename,))
return i
def files():
"""Generate input filenames to work on."""
#NOTE: you could read the file list from a file, get it using glob.glob, etc
yield "inputfile1"
yield "inputfile2"
def process_files(pool, filenames):
"""Process filenames using pool of processes.
Wait for results.
"""
for result in pool.imap_unordered(work_on, enumerate(filenames)):
#NOTE: in general the files won't be processed in the original order
print(result)
def main():
p = Pool()
# to do "successive" multiprocessing
for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
process_files(p, filenames)
if __name__=="__main__":
main()
Each process_file() is called in sequence after the previous one has been complete i.e., the files from different calls to process_files() are not processed in parallel.
Related
I need to navigate across 10,000 folders, collect some data from each folder, add it to 3 containers (c18, c17, c16, 3 initially empty lists each of which will be populated with 10,000 numbers) and it would take forever without parallellization.
My aim is to iterate through all folders with a for-loop (for i in range(10000)) and append 3 values extracted from each folder to c18, c17, c16 respectively, at each iteration of the for-loop.
I also want to display a progress bar - to know roughly how long would it take.
I have never parallelized a loop before or included a progress bar. I have tried to use SO. After reading some answers, I got to the point at which I wrote:
pool = multiprocessing.Pool(4)
pool.imap(funct, tqdm.tqdm(range(len(a0s))) # or pool.map(funct, tqdm.tqdm(range(len(a0s))))
len(a0s) yields 10,000.
The function funct is def funct(i): and does what I wrote above: for a given folder defined using the for-loop variable i (current iteration number), it does the job of extracting 3 values and appending them to c18, c17, c16.
I am calling pool.imap(funct, tqdm.tqdm(range(len(a0s))) inside a main() function and at the end of the .py script I wrote :
if __name__ == '__main__':
main()
I am importing:
import processing
import tqdm
However, all the above doesn't work.
How shall I proceed? Any help is welcomed.
Thanks!
a0s = np.loadtxt("Intensity_Wcm2_versus_a0_10_21_10_23_range.txt", usecols=(1,)) # has 10,000 entries
pool = multiprocessing.Pool(4)
top_folder_path = os.getcwd()
base_path = top_folder_path + "/a0_"
for i in range(len(a0s)):
results_folder = base_path + "{:.4f}".format(a0s[i])
if os.path.isdir(results_folder):
os.chdir(results_folder)
S = happi.Open(".")
pbb = S.ParticleBinning(0).get() # charge states diagnostic
c18.append(pbb['data'][-1][-1]) # first -1 is for last timestep recorded by diagnostic, second -1 is for last charge state (bare ions, Ar18+)
c17.append(pbb['data'][-1][-2])
c16.append(pbb['data'][-1][-2])
print("###########################################################]#########")
print("We have done the folder number: " + str(i) + " out of: " + str(len(a0s)))
os.chdir(top_folder_path)
else:
continue
def funct(i):
results_folder = base_path + "{:.4f}".format(a0s[i])
if os.path.isdir(results_folder):
os.chdir(results_folder)
S = happi.Open(".")
pbb = S.ParticleBinning(0).get() # charge states diagnosti
c18_val = pbb['data'][-1][-1]
c17_val = pbb['data'][-1][-2]
c16_val = pbb['data'][-1][-3]
c18.append(c18_val)
c17.append(c17_val)
c16.append(c16_val)
else:
return
def main():
pool.imap(funct, tqdm(range(len(a0s))))
if __name__ == '__main__':
main()
Here's a template for multiple progress bars and multiprocessing. Hope it helps. I set it up to expect to be updated 10 times in each process and added a sleep to be the parallelized "work".
import multiprocessing as mp
import tqdm
import time
from itertools import repeat
def funct(lock,i):
with lock:
bar = tqdm.tqdm(position=i,total=10,leave=False,ncols=100)
bar.set_lock(lock)
for _ in range(10):
time.sleep(.2)
bar.update(1)
bar.close()
return i*2
def main():
lock = mp.Manager().Lock()
with mp.Pool() as pool:
result = pool.starmap(funct, zip(repeat(lock),range(8)))
print()
print(result)
if __name__ == '__main__':
main()
Hello currently i am studying python and i wanted to know on how you can have a list that is being appended if there is a change constantly to a txtfile. Wording is a hard here is the code anyways
list=[]
random_number=0
file_handler=open("history.txt","w")
file_handler.write(str(list))
lenght_cumulative_data=len(list)
confirmed.append(random_number)
Now what i want to accomplish is that the list variable of the number 0 would be shown in history.txt but that doesnt happen and lets just imagine that random_number is always changing I want the list variable to be able to always update itself. Like if let say random_number changes to 1 and then 2 I want list to be updated to [0,1,2]. How do you do that? I've been searching on youtube and all they gave me is this write function is there anyway someone could refrence it or have any ideas?
from os import stat
from _thread import start_new_thread
from time import sleep
List = []
class WatchFileForChanges:
def __init__(self, filename):
self.file = filename
self.cached_file = stat(self.file).st_mtime
def watch(self):
num = 0
while 1:
status = stat(self.file).st_mtime
if status != self.cached_file:
self.cached_file = status
#file changed
List.append(num)
num += 1
def main():
Watcher = WatchFileForChanges("file.txt")
start_new_thread(Watcher.watch, ())
while 1:
print(List)
sleep(1)
if __name__ == '__main__':
main()
This will do what you want.
If I understood you correctly, you want to append to the list every time a file changes.
Note: this answer will only work on Windows
changes.py:
# Adapted from http://timgolden.me.uk/python/win32_how_do_i/watch_directory_for_changes.html
import threading
import os
import win32file
import win32con
ACTIONS = {
1 : "Created",
2 : "Deleted",
3 : "Updated",
4 : "Renamed from something",
5 : "Renamed to something"
}
# Thanks to Claudio Grondi for the correct set of numbers
FILE_LIST_DIRECTORY = 0x0001
def monitor_changes(callback, path, filenames):
path = path or ""
if type(filenames) == "str":
filenames = (filenames,)
thread = threading.Thread(target=_monitor, args=(callback, path, filenames))
thread.start()
return thread
def _monitor(callback, path, filenames):
hDir = win32file.CreateFile (
path,
FILE_LIST_DIRECTORY,
win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE | win32con.FILE_SHARE_DELETE,
None,
win32con.OPEN_EXISTING,
win32con.FILE_FLAG_BACKUP_SEMANTICS,
None
)
while True:
#
# ReadDirectoryChangesW takes a previously-created
# handle to a directory, a buffer size for results,
# a flag to indicate whether to watch subtrees and
# a filter of what changes to notify.
#
# NB Tim Juchcinski reports that he needed to up
# the buffer size to be sure of picking up all
# events when a large number of files were
# deleted at once.
#
results = win32file.ReadDirectoryChangesW (
hDir,
1024,
True,
win32con.FILE_NOTIFY_CHANGE_LAST_WRITE,
None,
None
)
for action, file in results:
if filenames and file not in filenames and os.path.basename(file) not in filenames:
continue
callback(action, file)
if __name__ == '__main__':
# monitor by printing
t = monitor_changes(print, ".", None)
And in your main.py:
import changes
import os
my_list = []
def callback(action_id, filename):
# the function running means
# that the file has been modified
action_desc = changes.ACTIONS[action_id]
print(action_desc, filename)
with open(filename) as f:
my_list.append(f.read())
thread = changes.monitor_changes(callback, ".", "my_file_that_I_want_to_monitor.txt")
If you want to monitor all files in the directory, call monitor_changes with None as the third argument.
Note: this will monitor all subdirectories, so files with the same name but in different folders will trigger the callback. If you want to avoid this, then check the filename passed to your callback function is exactly what you want to monitor.
I'm a hobby coder started with AHK, then some java and now I try to learn Python. I have searched and found some tips but I have yet not been able to implement it into my own code.
Hopefully someone here can help me, it's a very short program.
I'm using .txt csv database with ";" as a separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20.000 lines which makes the program "to slow", only using 25% CPU (1 core).
If I can make it use all 4 cores (100%) I guess it would perform the task alot faster. The task is basically to compare the CLIPBOARD with the database and if there is a match, it should give me an answer as a return. Perhaps also I can separate the database into 4 pieces?
The code right now looks like this! Not more then 65 lines and its doing its job (but to slow). Advice on how I can make this process into multi core needed.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def load_db():
while True:
try:
# Read and create database
db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
db = db.drop_duplicates()
return db
except:
print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
def top_answers(db, question):
db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = db.sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
return db_sorted
def write_txt(top):
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar.txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
def main():
try:
db = load_db()
last_db_reload = time.time()
while True:
# Get contents of clipboard
question = pp.paste()
# Rank answer
top = top_answers(db, question)
# If answer was found, show results
if len(top) > 0:
write_txt(top)
time.sleep(fall_back_time)
except:
print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
if name == 'main':
main()'
If you could divide the db into four equally large you could process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def worker(thread_id, question):
thread_id = str(thread_id)
db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
db = db.drop_duplicates()
db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = db.sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
top = db_sorted
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar" + thread_id + ".txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
return
def main():
question = pp.paste()
for i in range(1, 4):
t = threading.Thread(target=worker, args=(i, question))
t.start()
t.join()
if name == 'main':
main()
The solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses better pickle to tranfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing
ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()
def load_db():
while True:
try:
# Read and create database
db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
db.columns = ['question', 'answer']
#db = db.drop_duplicates() # i drop it for experiment
break
except:
print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
# split database into equal chunks:
# (if you have a lot of RAM, otherwise you
# need to compute ranges in db, something like
# chunk_size = len(db)//NUM_PROCESSES
# ranges[i] = (i*chunk_size, (i+1)*cjunk_size)
# and pass ranges in original db to processes
chunked_db = np.split(db, [NUM_PROCESSES], axis=0)
return chunked_db
def top_answers_multiprocessed(question, chunked_db):
# on unix, python uses 'fork' mode by default
# so the process has 'copy-on-change' access to all global variables
# i.e. if process will change something in db, it will be copied to it
# with a lot of overhead
# Unfortunately, I'fe heard that on Windows only 'spawn' mode with full
# copy of everything is used
# Process pipeline uses pickle, it's quite slow.
# so on small database you may not have benefit from multiprocessing
# If you are going to transfer big objects in or out, look
# in the direction of multiprocessing.Array
# this solution is not fully efficient,
# as pool is recreated each time
# You can create daemon processes which will monitor
# Queue for incoming questions, but it's harder to implement
def top_answers(idx):
# question is in the scope of parent function,
chunked_db[idx]['ratio'] = chunked_db[idx]['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = chunked_db[idx].sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
return db_sorted
with closing(Pool(processes=NUM_PROCESSES)) as pool:
# chunked_db is a list of databases
# they are in global scope, we send only index beacause
# all the data set is pickled
num_chunks = len(chunked_db)
# apply function top_answers across generator range(num_chunks)
res = pool.imap_unordered(top_answers, range(num_chunks))
res = list(res)
# now res is list of dataframes, let's join it
res_final = reduce(lambda left,right: pd.merge(left,right,on='ratio'), res)
return res_final
def write_txt(top):
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar.txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
def mainfunc():
global chunked_db
chunked_db = load_db()
last_db_reload = time.time()
print('db loaded')
last_clip = ""
while True:
# Get contents of clipboard
try:
new_clip = pp.paste()
except:
continue
if (new_clip != last_clip) and (len(new_clip)> 0):
print(new_clip)
last_clip = new_clip
question = new_clip.strip()
else:
continue
# Rank answer
top = top_answers_multiprocessed(question, chunked_db)
# If answer was found, show results
if len(top) > 0:
#write_txt(top)
print(top)
if __name__ == '__main__':
mainfunc()
I may be approaching this all wrong but still this is where I'm at. I have very large log files I'm trying to search, up to 30gb in some cases. I'm writing a script to pull info and have been playing with multi process to speed it up a bit. right now I'm testing running two functions at the same time to search from the top and bottom to get results, which seems to work. I'm wondering if it's possible to stop one function one a result from the other. Such as if the top function finds a result they both stop. This way I can build it out as needed.
from file_read_backwards import FileReadBackwards
from multiprocessing import Process
import sys
z = "log.log"
#!/usr/bin/env python
rocket = 0
def top():
target = "test"
with open(z) as src:
found= None
for line in src:
if len(line) == 0: break #happens at end of file, then stop loop
if target in line:
found= line
break
print(found)
def bottom():
target = "text"
with FileReadBackwards(z) as src:
found= None
for line in src:
if len(line) == 0: break #happens at end of file, then stop loop
if target in line:
found= line
break
print(found)
if __name__=='__main__':
p1 = Process(target = top)
p1.start()
p2 = Process(target = bottom)
p2.start()
Here's a proof-of-concept of the approach I mentioned in the comments:
import os
import random
import sys
from multiprocessing import Process, Value
def search(proc_no, file_name, seek_to, max_size, find, flag):
stop_at = seek_to + max_size
with open(file_name) as f:
if seek_to:
f.seek(seek_to - 1)
prev_char = f.read(1)
if prev_char != '\n':
# Landed in the middle of a line. Skip back one (or
# maybe more) lines so this line isn't excluded. Start
# by seeking back 256 bytes, then 512 if necessary, etc.
exponent = 8
pos = seek_to
while pos >= seek_to:
pos = f.seek(max(0, pos - (2 ** exponent)))
f.readline()
pos = f.tell()
exponent += 1
while True:
if flag.value:
break
line = f.readline()
if not line:
break # EOF
data = line.strip()
if data == find:
flag.value = proc_no
print(data)
break
if f.tell() > stop_at:
break
if __name__ == '__main__':
# list.txt contains lines with the numbers 1 to 1000001
file_name = 'list.txt'
info = os.stat(file_name)
file_size = info.st_size
if len(sys.argv) == 1:
# Pick a random value from list.txt
num_lines = 1000001
choices = list(range(1, num_lines + 1))
choices.append('XXX')
find = str(random.choice(choices))
else:
find = sys.argv[1]
num_procs = 4
chunk_size, remainder = divmod(file_size, num_procs)
max_size = chunk_size + remainder
flag = Value('i', 0)
procs = []
print(f'Using {num_procs} processes to look for {find} in {file_name}')
for i in range(num_procs):
seek_to = i * chunk_size
proc = Process(target=search, args=(i + 1, file_name, seek_to, max_size, find, flag))
procs.append(proc)
for proc in procs:
proc.start()
for proc in procs:
proc.join()
if flag.value:
print(find, 'found by proc', flag.value)
else:
print(find, 'not found')
After reading various posts[1] about reading files with multiprocessing and multithreading, it seems that neither is a great approach due to potential disk thrashing and serialized reads. So here's a different, simpler approach that is way faster (at least for the file with a million lines I was trying it out on):
import mmap
import sys
def search_file(file_name, text, encoding='utf-8'):
text = text.encode(encoding)
with open(file_name) as f:
with mmap.mmap(f.fileno(), 0, flags=mmap.ACCESS_READ, prot=mmap.PROT_READ) as m:
index = m.find(text)
if index > -1:
# Found a match; now find beginning of line that
# contains match so we can grab the whole line.
while index > 0:
index -= 1
if m[index] == 10:
index += 1
break
else:
index = 0
m.seek(index)
line = m.readline()
return line.decode(encoding)
if __name__ == '__main__':
file_name, search_string = sys.argv[1:]
line = search_file(file_name, search_string)
sys.stdout.write(line if line is not None else f'Not found in {file_name}: {search_string}\n')
I'm curious how this would perform with a 30GB log file.
[1] Including this one
Simple example using a multiprocessing.Pool and callback function.
Terminates remaining pool processes once a result has returned.
You could add an arbitrary number of processes to search from different offsets in the file using this approach.
import math
import time
from multiprocessing import Pool
from random import random
def search(pid, wait):
"""Sleep for wait seconds, return PID
"""
time.sleep(wait)
return pid
def done(result):
"""Do something with result and stop other processes
"""
print("Process: %d done." % result)
pool.terminate()
print("Terminate Pool")
pool = Pool(2)
pool.apply_async(search, (1, math.ceil(random() * 3)), callback=done)
pool.apply_async(search, (2, math.ceil(random() * 3)), callback=done)
# do other stuff ...
# Wait for result
pool.close()
pool.join() # block our main thread
This is essentially the same as Blurp's answer, but I shortened it and made it a bit to make it more general. As you can see top should be an infinite loop, but bottom stops top immediately.
from multiprocessing import Process
valNotFound = True
def top():
i=0
while ValNotFound:
i += 1
def bottom():
ValNotFound = False
p1 = Process(target = top)
p2 = Process(target = bottom)
p1.start()
p2.start()
I am trying to write a python script scanning a folder and collect updated SQL script, and then automatically pull data for the SQL script. In the code, a while loop is scanning new SQL file, and send to data pull function. I am having trouble to understand how to make a dynamic queue with while loop, but also have multiprocess to run the tasks in the queue.
The following code has a problem that the while loop iteration will work on a long job before it moves to next iteration and collects other jobs to fill the vacant processor.
Update:
Thanks to #pbacterio for catching the bug, and now the error message is gone. After changing the code, the python code can take all the job scripts during one iteration, and distribute the scripts to four processors. However, it will get hang by a long job to go to next iteration, scanning and submitting the newly added job scripts. Any idea how to reconstruct the code?
I finally figured out the solution see answer below. It turned out what I was looking for is
the_queue = Queue()
the_pool = Pool(4, worker_main,(the_queue,))
For those stumble on the similar idea, following is the whole architecture of this automation script converting a shared drive to a 'server for SQL pulling' or any other job queue 'server'.
a. The python script auto_data_pull.py as shown in the answer. You need to add your own job function.
b. A 'batch script' with following:
start C:\Anaconda2\python.exe C:\Users\bin\auto_data_pull.py
c. Add a task triggered by start computer, run the 'batch script'
That's all. It works.
Python Code:
from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support
#
# Function run by worker processes
#
def worker(input, output):
for func, args in iter(input.get, 'STOP'):
result = compute(func, args)
output.put(result)
#
# Function used to compute result
#
def compute(func, args):
result = func(args)
return '%s says that %s%s = %s' % \
(current_process().name, func.__name__, args, result)
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
return files_dict
##### working in single thread
def single_thread():
path = "Y:/"
before = check_files(path)
sql_queue = []
while True:
time.sleep(3)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
# print sql_queue
for sql_file in sql_queue:
try:
query_sql(sql_file)
except:
pass
##### not working in queue
def multiple_thread():
NUMBER_OF_PROCESSES = 4
path = "Y:/"
sql_queue = []
before = check_files(path) # get the current dictionary of sql_files
task_queue = Queue()
done_queue = Queue()
while True: #while loop to check the changes of the files
time.sleep(5)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
TASKS = [(query_sql, sql_file) for sql_file in sql_queue]
# Create queues
#submit task
for task in TASKS:
task_queue.put(task)
for i in range(NUMBER_OF_PROCESSES):
p = Process(target=worker, args=(task_queue, done_queue)).start()
# try:
# p = Process(target=worker, args=(task_queue))
# p.start()
# except:
# pass
# Get and print results
print 'Unordered results:'
for i in range(len(TASKS)):
print '\t', done_queue.get()
# Tell child processes to stop
for i in range(NUMBER_OF_PROCESSES):
task_queue.put('STOP')
# single_thread()
if __name__ == '__main__':
# freeze_support()
multiple_thread()
Reference:
monitor file changes with python script: http://timgolden.me.uk/python/win32_how_do_i/watch_directory_for_changes.html
Multiprocessing:
https://docs.python.org/2/library/multiprocessing.html
Where did you define sql_file in multiple_thread() in
multiprocessing.Process(target=query_sql, args=(sql_file)).start()
You have not defined sql_file in the method and moreover you have used that variable in a for loop. The variable's scope is only confined to the for loop.
Try replacing this:
result = func(*args)
by this:
result = func(args)
I have figured this out. Thank your for the response inspired the thought.
Now the script can run a while loop to monitor the folder for new updated/added SQL script, and then distribute the data pulling to multiple threads. The solution comes from the queue.get(), and queue.put(). I assume the queue object takes care of the communication by itself.
This is the final code --
from glob import glob
import os, time
import sys
import pypyodbc
from multiprocessing import Process, Queue, Event, Pool, current_process, freeze_support
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
try:
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
except:
pass
return files_dict
def worker_main(queue):
print os.getpid(),"working"
while True:
item = queue.get(True)
query_sql(item)
def main():
the_queue = Queue()
the_pool = Pool(4, worker_main,(the_queue,))
path = "Y:/"
before = check_files(path) # get the current dictionary of sql_files
while True: #while loop to check the changes of the files
time.sleep(5)
sql_queue = []
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
if sql_queue:
for jsl_file in sql_queue:
try:
the_queue.put(jsl_file)
except:
print "{0} failed with error {1}. \n".format(jsl_file, str(sys.exc_info()[0]))
pass
else:
pass
if __name__ == "__main__":
main()