I am trying to compress around 95 files, each about 7 GB in size, using the Python multiprocessing module:
import os
from shutil import copyfileobj
import bz2
import multiprocessing as mp
import pprint
from numpy.core.test_rational import numerator

''' Input / Output Path '''
ipath = 'E:/AutoConfirm/'
opath = 'E:/compressed-autoconfirm/'

''' Number of Processes '''
num_of_proc = 6

def compressFile(fileName, chunkSize=100000000):
    global ipath
    print 'Started Compressing %s to %s' % (fileName, opath)
    inp = open(ipath + fileName, 'rb')
    output = bz2.BZ2File(opath + fileName.split('/')[-1].strip('.csv') + '.bz2', 'wb', compresslevel=9)
    copyfileobj(inp, output, chunkSize)
    print 'Finished Compressing %s to %s' % (fileName, opath)

def process_worker(fileList):
    for x in fileList:
        compressFile(x)

def split_list(tempList):
    a, reList = 0, []
    global num_of_proc
    for x in range(num_of_proc + 1):
        reList.append([tempList[a:a + len(tempList) / num_of_proc]])
        a = a + len(tempList) / num_of_proc
    return reList

pool = mp.Pool(processes=num_of_proc)

''' Prepare a list of all the file names '''
tempList = [x for x in os.listdir(ipath)]

''' Split the list into sub-lists
    For example : if I have 90 files and I am using 6 processes
    each of the process will work on 15 files each '''
iterList = split_list(tempList)

''' print iterList >> [ [filename1, filename2] , [filename3,filename4], ... ] '''

''' Pass the list consisting of sub-lists to pool '''
pool.map(process_worker, iterList)
The above code ends up creating 90 processes instead of 6. Can anyone help me identify the defect in the code?
On Windows, multiprocessing re-imports your module in every worker process; since everything is at top level, the whole script (including the Pool creation and the pool.map call) runs again in each child, and again in their children.
You need to put the top-level code in a function and only call it when the script is run directly:
def main():
    ...

if __name__ == '__main__':
    main()
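For the script above, that could look like the following minimal sketch (Python 3 syntax; the paths and worker count are taken from the question). Handing the file list straight to the pool also removes the need for split_list, which is a separate source of bugs here (it builds num_of_proc + 1 doubly-nested lists):

import bz2
import multiprocessing as mp
import os
from shutil import copyfileobj

ipath = 'E:/AutoConfirm/'
opath = 'E:/compressed-autoconfirm/'
num_of_proc = 6

def compress_file(filename, chunk_size=100000000):
    # Hypothetical reworking of compressFile(); paths come from the question.
    src = os.path.join(ipath, filename)
    dst = os.path.join(opath, os.path.splitext(filename)[0] + '.bz2')
    with open(src, 'rb') as inp, bz2.BZ2File(dst, 'wb', compresslevel=9) as out:
        copyfileobj(inp, out, chunk_size)
    return filename

def main():
    files = os.listdir(ipath)
    # The pool hands files to the 6 workers one at a time; no manual splitting needed.
    with mp.Pool(processes=num_of_proc) as pool:
        for done in pool.imap_unordered(compress_file, files):
            print('Finished compressing', done)

if __name__ == '__main__':
    main()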
I need to navigate across 10,000 folders, collect some data from each folder, and add it to 3 containers (c18, c17, c16 - three initially empty lists, each of which will be populated with 10,000 numbers), and it would take forever without parallelization.
My aim is to iterate through all folders with a for-loop (for i in range(10000)) and append 3 values extracted from each folder to c18, c17, c16 respectively, at each iteration of the for-loop.
I also want to display a progress bar - to know roughly how long would it take.
I have never parallelized a loop before or included a progress bar. I have tried to use SO. After reading some answers, I got to the point at which I wrote:
pool = multiprocessing.Pool(4)
pool.imap(funct, tqdm.tqdm(range(len(a0s))))  # or pool.map(funct, tqdm.tqdm(range(len(a0s))))
len(a0s) yields 10,000.
The function funct is def funct(i): and does what I wrote above: for a given folder defined using the for-loop variable i (current iteration number), it does the job of extracting 3 values and appending them to c18, c17, c16.
I am calling pool.imap(funct, tqdm.tqdm(range(len(a0s)))) inside a main() function, and at the end of the .py script I wrote:
if __name__ == '__main__':
    main()
I am importing:
import multiprocessing
import tqdm
However, none of the above works.
How shall I proceed? Any help is welcome.
Thanks!
a0s = np.loadtxt("Intensity_Wcm2_versus_a0_10_21_10_23_range.txt", usecols=(1,))  # has 10,000 entries
pool = multiprocessing.Pool(4)
top_folder_path = os.getcwd()
base_path = top_folder_path + "/a0_"

for i in range(len(a0s)):
    results_folder = base_path + "{:.4f}".format(a0s[i])
    if os.path.isdir(results_folder):
        os.chdir(results_folder)
        S = happi.Open(".")
        pbb = S.ParticleBinning(0).get()  # charge states diagnostic
        c18.append(pbb['data'][-1][-1])  # first -1 is for last timestep recorded by diagnostic, second -1 is for last charge state (bare ions, Ar18+)
        c17.append(pbb['data'][-1][-2])
        c16.append(pbb['data'][-1][-2])
        print("######################################################################")
        print("We have done the folder number: " + str(i) + " out of: " + str(len(a0s)))
        os.chdir(top_folder_path)
    else:
        continue
def funct(i):
    results_folder = base_path + "{:.4f}".format(a0s[i])
    if os.path.isdir(results_folder):
        os.chdir(results_folder)
        S = happi.Open(".")
        pbb = S.ParticleBinning(0).get()  # charge states diagnostic
        c18_val = pbb['data'][-1][-1]
        c17_val = pbb['data'][-1][-2]
        c16_val = pbb['data'][-1][-3]
        c18.append(c18_val)
        c17.append(c17_val)
        c16.append(c16_val)
    else:
        return

def main():
    pool.imap(funct, tqdm(range(len(a0s))))

if __name__ == '__main__':
    main()
Here's a template for multiple progress bars and multiprocessing. Hope it helps. I set it up to expect to be updated 10 times in each process and added a sleep to be the parallelized "work".
import multiprocessing as mp
import tqdm
import time
from itertools import repeat

def funct(lock, i):
    with lock:
        bar = tqdm.tqdm(position=i, total=10, leave=False, ncols=100)
        bar.set_lock(lock)
    for _ in range(10):
        time.sleep(.2)
        bar.update(1)
    bar.close()
    return i * 2

def main():
    lock = mp.Manager().Lock()
    with mp.Pool() as pool:
        result = pool.starmap(funct, zip(repeat(lock), range(8)))
    print()
    print(result)

if __name__ == '__main__':
    main()
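A follow-up note on the original funct: appends to c18, c17 and c16 inside worker processes only modify each worker's own copy of those lists, so the parent's lists stay empty. A minimal sketch (assuming the happi calls behave as in the question, and that a0s and base_path are set up at module level the same way) that returns the three values instead and collects them with imap, with a single overall progress bar:

import multiprocessing as mp
import os
import numpy as np
import tqdm
import happi  # assumption: the same post-processing package the question uses

# Assumed module-level setup, copied from the question:
a0s = np.loadtxt("Intensity_Wcm2_versus_a0_10_21_10_23_range.txt", usecols=(1,))
base_path = os.getcwd() + "/a0_"

def funct(i):
    """Return (c18, c17, c16) for folder i, or None if the folder is missing."""
    results_folder = base_path + "{:.4f}".format(a0s[i])
    if not os.path.isdir(results_folder):
        return None
    S = happi.Open(results_folder)  # open by path instead of calling os.chdir in the worker
    pbb = S.ParticleBinning(0).get()
    last = pbb['data'][-1]          # last recorded timestep
    return last[-1], last[-2], last[-3]

def main():
    c18, c17, c16 = [], [], []
    with mp.Pool(4) as pool:
        # tqdm wraps the result iterator, so the bar advances as folders finish.
        for res in tqdm.tqdm(pool.imap(funct, range(len(a0s))), total=len(a0s)):
            if res is not None:
                c18.append(res[0])
                c17.append(res[1])
                c16.append(res[2])

if __name__ == '__main__':
    main()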
Hello, I am currently studying Python and I want to know how you can have a list that keeps being appended to whenever a text file changes. The wording is hard; here is the code anyway:
list=[]
random_number=0
file_handler=open("history.txt","w")
file_handler.write(str(list))
lenght_cumulative_data=len(list)
confirmed.append(random_number)
Now what I want to accomplish is that the list containing the number 0 would be shown in history.txt, but that doesn't happen. Let's imagine that random_number is always changing; I want the list to always update itself. For example, if random_number changes to 1 and then 2, I want the list to be updated to [0, 1, 2]. How do you do that? I've been searching on YouTube and all they gave me is this write function. Is there any way someone could reference it or have any ideas?
from os import stat
from _thread import start_new_thread
from time import sleep

List = []

class WatchFileForChanges:
    def __init__(self, filename):
        self.file = filename
        self.cached_file = stat(self.file).st_mtime

    def watch(self):
        num = 0
        while 1:
            status = stat(self.file).st_mtime
            if status != self.cached_file:
                self.cached_file = status
                # file changed
                List.append(num)
                num += 1

def main():
    Watcher = WatchFileForChanges("file.txt")
    start_new_thread(Watcher.watch, ())
    while 1:
        print(List)
        sleep(1)

if __name__ == '__main__':
    main()
This will do what you want.
If I understood you correctly, you want to append to the list every time a file changes.
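If the file should also always show the full list, as the question's history.txt example asks, the List.append(num) step above can be paired with rewriting the file; a minimal sketch (the file name is taken from the question):

history = []

def record(value, path="history.txt"):
    # Append the new value and rewrite the whole file so it always shows the full list.
    history.append(value)
    with open(path, "w") as fh:
        fh.write(str(history))

# record(0); record(1); record(2)  ->  history.txt contains "[0, 1, 2]"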
Note: the following answer will only work on Windows
changes.py:
# Adapted from http://timgolden.me.uk/python/win32_how_do_i/watch_directory_for_changes.html
import threading
import os
import win32file
import win32con

ACTIONS = {
    1: "Created",
    2: "Deleted",
    3: "Updated",
    4: "Renamed from something",
    5: "Renamed to something"
}
# Thanks to Claudio Grondi for the correct set of numbers
FILE_LIST_DIRECTORY = 0x0001

def monitor_changes(callback, path, filenames):
    path = path or ""
    if isinstance(filenames, str):
        filenames = (filenames,)
    thread = threading.Thread(target=_monitor, args=(callback, path, filenames))
    thread.start()
    return thread

def _monitor(callback, path, filenames):
    hDir = win32file.CreateFile(
        path,
        FILE_LIST_DIRECTORY,
        win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE | win32con.FILE_SHARE_DELETE,
        None,
        win32con.OPEN_EXISTING,
        win32con.FILE_FLAG_BACKUP_SEMANTICS,
        None
    )
    while True:
        #
        # ReadDirectoryChangesW takes a previously-created
        # handle to a directory, a buffer size for results,
        # a flag to indicate whether to watch subtrees and
        # a filter of what changes to notify.
        #
        # NB Tim Juchcinski reports that he needed to up
        # the buffer size to be sure of picking up all
        # events when a large number of files were
        # deleted at once.
        #
        results = win32file.ReadDirectoryChangesW(
            hDir,
            1024,
            True,
            win32con.FILE_NOTIFY_CHANGE_LAST_WRITE,
            None,
            None
        )
        for action, file in results:
            if filenames and file not in filenames and os.path.basename(file) not in filenames:
                continue
            callback(action, file)

if __name__ == '__main__':
    # monitor by printing
    t = monitor_changes(print, ".", None)
And in your main.py:
import changes
import os

my_list = []

def callback(action_id, filename):
    # the function running means
    # that the file has been modified
    action_desc = changes.ACTIONS[action_id]
    print(action_desc, filename)
    with open(filename) as f:
        my_list.append(f.read())

thread = changes.monitor_changes(callback, ".", "my_file_that_I_want_to_monitor.txt")
If you want to monitor all files in the directory, call monitor_changes with None as the third argument.
Note: this will monitor all subdirectories, so files with the same name but in different folders will trigger the callback. If you want to avoid this, then check the filename passed to your callback function is exactly what you want to monitor.
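A sketch of that check, assuming the watcher is started from the watched directory (the paths it reports are relative to that directory):

import os

TARGET = os.path.abspath("my_file_that_I_want_to_monitor.txt")

def callback(action_id, filename):
    # Ignore events for files with the same name in other folders.
    if os.path.abspath(filename) != TARGET:
        return
    print("change detected in", filename)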
I'm a hobby coder who started with AHK, then some Java, and now I'm trying to learn Python. I have searched and found some tips, but I have not yet been able to implement them in my own code.
Hopefully someone here can help me, it's a very short program.
I'm using a .txt CSV database with ";" as the separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20,000 lines, which makes the program too slow; it only uses 25% CPU (1 core).
If I can make it use all 4 cores (100%), I guess it would perform the task a lot faster. The task is basically to compare the clipboard with the database and, if there is a match, return an answer. Perhaps I can also separate the database into 4 pieces?
The code right now looks like this. It is not more than 65 lines and it's doing its job (but too slowly). Advice on how I can make this process multi-core is needed.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy

ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'

def load_db():
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db = db.drop_duplicates()
            return db
        except:
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)

def top_answers(db, question):
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    return db_sorted

def write_txt(top):
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar.txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")

def main():
    try:
        db = load_db()
        last_db_reload = time.time()
        while True:
            # Get contents of clipboard
            question = pp.paste()
            # Rank answer
            top = top_answers(db, question)
            # If answer was found, show results
            if len(top) > 0:
                write_txt(top)
            time.sleep(fall_back_time)
    except:
        print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
        time.sleep(fall_back_time)

if __name__ == '__main__':
    main()
If you divide the db into four equally large parts, you can process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading

ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'

def worker(thread_id, question):
    thread_id = str(thread_id)
    db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
    db = db.drop_duplicates()
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    top = db_sorted
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar" + thread_id + ".txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")
    return

def main():
    question = pp.paste()
    threads = []
    for i in range(1, 5):  # four chunks: database.txt1 .. database.txt4
        t = threading.Thread(target=worker, args=(i, question))
        t.start()
        threads.append(t)
    # Join after all threads have been started, otherwise they run one at a time.
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
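One caveat on the threaded version: fuzz.ratio is CPU-bound, and CPython's GIL keeps those threads on a single core, so this alone will not reach 100% CPU. Swapping the threads for a process pool avoids that; a minimal sketch, reusing the worker() above and assuming the database has been split into database.txt1 through database.txt4:

import multiprocessing as mp

def main():
    question = pp.paste()
    # One worker process per chunk; each writes its own svar<N>.txt as before.
    with mp.Pool(processes=4) as pool:
        pool.starmap(worker, [(i, question) for i in range(1, 5)])

if __name__ == '__main__':
    main()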
A full solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses a better pickle to transfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing

ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()

def load_db():
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db.columns = ['question', 'answer']
            #db = db.drop_duplicates()  # dropped for the experiment
            break
        except:
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)
    # Split the database into equal chunks
    # (if you have a lot of RAM; otherwise you need to compute ranges in db,
    # something like
    #   chunk_size = len(db) // NUM_PROCESSES
    #   ranges[i] = (i * chunk_size, (i + 1) * chunk_size)
    # and pass ranges into the original db to the processes).
    chunked_db = np.array_split(db, NUM_PROCESSES, axis=0)  # np.split(db, [NUM_PROCESSES]) would cut at row NUM_PROCESSES instead of into NUM_PROCESSES parts
    return chunked_db

def top_answers_multiprocessed(question, chunked_db):
    # On Unix, Python uses 'fork' mode by default, so each process has
    # copy-on-write access to all global variables, i.e. if a process changes
    # something in db, that part gets copied into it with a lot of overhead.
    # Unfortunately, on Windows only 'spawn' mode with a full copy of
    # everything is used.
    # The process pipeline uses pickle, which is quite slow, so on a small
    # database you may not benefit from multiprocessing.
    # If you are going to transfer big objects in or out, look in the
    # direction of multiprocessing.Array.
    # This solution is not fully efficient, as the pool is recreated each
    # time. You could create daemon processes that monitor a Queue for
    # incoming questions, but that is harder to implement.
    def top_answers(idx):
        # question is in the scope of the parent function
        chunked_db[idx]['ratio'] = chunked_db[idx]['question'].apply(lambda q: fuzz.ratio(q, question))
        db_sorted = chunked_db[idx].sort_values(by='ratio', ascending=False)
        db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
        return db_sorted

    with closing(Pool(processes=NUM_PROCESSES)) as pool:
        # chunked_db is a list of dataframes; they are in global scope, so we
        # send only the index, because otherwise the whole data set is pickled
        num_chunks = len(chunked_db)
        # apply top_answers across the generator range(num_chunks)
        res = pool.imap_unordered(top_answers, range(num_chunks))
        res = list(res)
    # now res is a list of dataframes; concatenate them into one result
    res_final = pd.concat(res)
    return res_final

def write_txt(top):
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar.txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")

def mainfunc():
    global chunked_db
    chunked_db = load_db()
    last_db_reload = time.time()
    print('db loaded')
    last_clip = ""
    while True:
        # Get contents of clipboard
        try:
            new_clip = pp.paste()
        except:
            continue
        if (new_clip != last_clip) and (len(new_clip) > 0):
            print(new_clip)
            last_clip = new_clip
            question = new_clip.strip()
        else:
            continue
        # Rank answer
        top = top_answers_multiprocessed(question, chunked_db)
        # If answer was found, show results
        if len(top) > 0:
            #write_txt(top)
            print(top)

if __name__ == '__main__':
    mainfunc()
I'm trying to build a list of parent/comment pairs from the publicly available Reddit data set.
I have a CSV file which I load into a Pandas dataframe which contains rows of the comments with the parent and child id, as well as the child comment. The data is loaded using the following block of code:
import os
import time
import multiprocessing as mp
import numpy as np
import pandas as pd

sourcePATH = 'C:\\'  # a raw string cannot end with a backslash
workingFILE = r'\output-pt1.csv'

# filepaths
input_file = sourcePATH + workingFILE

data_df = pd.read_csv(input_file, header=None, names=['PostIDX', 'ParentIDX', 'Comment', 'Score', 'Controversiality'])
The aim is to scan through each row in the dataframe and use the parent id to search through the rest of the dataframe to see if there is a parent comment present. If there is, I then store the child and parent comments in a tuple with some other information. This will then be added to a list which will be written out to a csv file at the end. To do this I use the following code:
def checkChildParent(ParentIDX_curr, ChildIDX_curr, ChildComment_curr, ChildScore_curr, ChildCont_curr):
    idx = data_df.loc[data_df['PostIDX'] == ParentIDX_curr]
    if idx.empty is False:
        ParentComment = idx.iloc[0, 2]
        ParentScore = idx.iloc[0, 3]
        ParentCont = idx.iloc[0, 4]
        outPut.put([ParentIDX_curr[0], ParentComment, ParentScore, ParentCont, ChildIDX_curr[0], ChildComment_curr[0], ChildScore_curr[0], ChildCont_curr[0]])
if __name__ == '__main__':
    print('Process started')
    t_start_init = time.time()
    t_start = time.time()

    noCores = 1
    #pool = mp.Pool(processes=noCores)
    update_freq = 100
    n = 1000
    #n = round(len(data_df)/8)

    flag_create = 0
    flag_run = 0
    i = 0

    outPut = mp.Queue()

    #parent_child_df = pd.DataFrame()
    #parent_child_df.columns = ['PostIDX','ParentIDX']

    while i < n:
        #print(i)
        procs = []
        ParentIDX = []
        ParentComment = []
        ParentScore = []
        ParentCont = []
        ChildIDX = []
        ChildComment = []
        ChildScore = []
        ChildCont = []

        for worker in range(0, noCores):
            ParentIDX.append(data_df.iloc[i, 1])
            ChildIDX.append(data_df.iloc[i, 0])
            ChildComment.append(data_df.iloc[i, 2])
            ChildScore.append(data_df.iloc[i, 3])
            ChildCont.append(data_df.iloc[i, 4])
            i = i + 1

        #when I call the function this way it returns the expected matches
        #checkChildParent(ParentIDX,ChildIDX,ChildComment,
        #                 ChildScore,ChildCont)

        #when I call the function with the Process function nothing appears to be happening
        for proc in range(0, noCores):
            p = mp.Process(target=checkChildParent, args=(ParentIDX[proc], ChildIDX[proc], ChildComment[proc], ChildScore[proc], ChildCont[proc]))
            procs.append(p)
            p.start()

        #for p in procs:
        #    p.join()

        if outPut.empty() is False:
            print(outPut.get())
At the top of the file is a function which scans the dataframe for a given row and returns the tuple of the matched parent and child comments if it was found. If I call this function normally it works fine, but when I call it using the Process function it doesn't match anything! I'm guessing it's the form in which the arguments are being passed to the function that is causing the issue, but I have been trying to debug this all afternoon and have failed so far. If anyone has any suggestions then please let me know!
Thanks!
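As an aside, the row-by-row lookup described above can also be written as a single pandas self-merge, which avoids the Process/Queue handling entirely; a sketch against the same column names as the question's dataframe (the output file name is hypothetical):

import pandas as pd

# data_df columns: PostIDX, ParentIDX, Comment, Score, Controversiality (as in the question)
pairs = data_df.merge(
    data_df,
    left_on='ParentIDX',
    right_on='PostIDX',
    suffixes=('_child', '_parent'),
)
# Each row of `pairs` now holds a child comment next to its parent comment,
# mirroring the tuple built in checkChildParent().
pairs.to_csv('parent_child_pairs.csv', index=False)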
I am filtering huge text files using Python's multiprocessing module. The code basically opens the text files, works on them, then closes them.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create Launcher and LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
    global LauncherCount
    LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work either, as it edits the variable LauncherCount.LauncherCount locally, so the change is not seen in the imported version of LauncherCount.
Is there any way to edit a variable in an imported file globally? Or is there any way to do this some other way? What I need is to run a piece of code multiple times, changing one value each time, and apparently without using any loop.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount

class Filter:
    """ Filtering methods """
    def __init__(self):
        print("launching methods")

    # Return the list: [Latitude,Longitude] (elements are floating point numbers)
    def LatLong(self, line):
        comaCount = []
        comaCount.append(line.find(','))
        comaCount.append(line.find(',', comaCount[0] + 1))
        comaCount.append(line.find(',', comaCount[1] + 1))
        Lat = line[comaCount[0] + 1 : comaCount[1]]
        Long = line[comaCount[1] + 1 : comaCount[2]]
        try:
            return [float(Lat), float(Long)]
        except ValueError:
            return [0, 0]

    # Return a boolean:
    # - True if the Lat/Long is within the Lat/Long rectangle defined by:
    #     tupleFilter = (minLat,maxLat,minLong,maxLong)
    # - False if not
    def LatLongFilter(self, LatLongList, tupleFilter):
        if (tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and
                tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]):
            return True
        else:
            return False

    def writeLine(self, key, line):
        filterDico[key][1].write(line)

def filteringProcess(dico):
    myFilter = Filter()
    while True:
        try:
            currentLine = readFile.readline()
        except ValueError:
            break
        if len(currentLine) == 0:  # Breaks at the end of the file
            break
        if len(currentLine) < 35:  # Deletes wrong lines (too short)
            continue
        LatLongList = myFilter.LatLong(currentLine)
        for key in dico:
            if myFilter.LatLongFilter(LatLongList, dico[key][0]):
                myFilter.writeLine(key, currentLine)

###########################################################################
# Main
###########################################################################

# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')

# Generate writing files:
pathDico = {}
filterDico = config.filterDico

# Create outputs
for key in filterDico:
    output_Name = (config.readFileList[LauncherCount.LauncherCount][0][:-4]
                   + '_' + key + '.log')
    pathDico[output_Name] = config.writingFolder + output_Name
    filterDico[key] = [filterDico[key], open(pathDico[output_Name], 'w')]

p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)

startingTime = time.localtime()

if __name__ == '__main__':
    ### Create and start processes:
    for i in CPURange:
        p.append(multiprocessing.Process(target=filteringProcess,
                                         args=(filterDico,)))
        p[i].start()

    ### Kill processes:
    while True:
        if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
            readFile.close()
            for key in config.filterDico:
                config.filterDico[key][1].close()
                print(key, "is Done!")
            endTime = time.localtime()
            break

    print("Process started at:", startingTime)
    print("And ended at:", endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool

def work_on(args):
    """Process a single file."""
    i, filename = args
    print("working on %s" % (filename,))
    return i

def files():
    """Generate input filenames to work on."""
    #NOTE: you could read the file list from a file, get it using glob.glob, etc
    yield "inputfile1"
    yield "inputfile2"

def process_files(pool, filenames):
    """Process filenames using pool of processes.

    Wait for results.
    """
    for result in pool.imap_unordered(work_on, enumerate(filenames)):
        #NOTE: in general the files won't be processed in the original order
        print(result)

def main():
    p = Pool()
    # to do "successive" multiprocessing
    for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
        process_files(p, filenames)

if __name__ == "__main__":
    main()
Each process_files() call runs in sequence, only after the previous one has completed, i.e., the files from different calls to process_files() are not processed in parallel.
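If the separate-launcher approach from the question is kept instead, the changing index can be passed to each run on the command line rather than through a shared module global, since os.system starts a fresh interpreter that never sees the parent's LauncherCount; a minimal sketch (the script name is taken from the question, the file count is a placeholder):

# Launcher.py (sketch): run the filter script once per input file,
# passing the file index as a command-line argument.
import subprocess
import sys

NUM_FILES = 2  # one run per entry in config.readFileList; adjust as needed

for count in range(NUM_FILES):
    subprocess.run([sys.executable, "OrientedFilterNoLoop.py", str(count)], check=True)

# Inside OrientedFilterNoLoop.py, read the index back instead of importing LauncherCount:
#   import sys
#   LauncherCount = int(sys.argv[1])
#   readFile = open(config.readFileList[LauncherCount][1], 'r')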