I want to build an inverted index using multiprocessing to speed it up. My idea is to split the files into groups; each process builds its own inverted index, and then I merge all of these indexes into one. But I don't know how to return them to the main process that will do the merging.
import multiprocessing as mp
from pathlib import Path
import re
import time


class InvertedIndex:
    def __init__(self):
        self.index = dict()

    def createIndex(self, path='data', threads_num=4):
        pathList = list(Path(path).glob('**/*.txt'))
        fileNum = len(pathList)
        oneProcessNum = fileNum / threads_num
        processes = []
        for i in range(threads_num):
            startIndex = int(i * oneProcessNum)
            endIndex = int((i + 1) * oneProcessNum)
            currLi = pathList[startIndex:endIndex]
            p = mp.Process(target=self.oneProcessTask, args=(currLi,))
            processes.append(p)
        [x.start() for x in processes]
        [x.join() for x in processes]

    @staticmethod
    def oneProcessTask(listOfDoc):
        # print(f'Start: {listOfDoc[0]}, end: {listOfDoc[-1]}')  # temp
        tempDict = dict()
        for name in listOfDoc:
            with open(name) as f:
                text = f.read()
                li = re.findall(r'\b\w+\b', text)
                for w in li:
                    if tempDict.get(w) is None:
                        tempDict[w] = set()
                    tempDict[w].add(str(name))

    def getListOfDoc(self, keyWord):
        return self.index[keyWord]


if __name__ == '__main__':
    ii = InvertedIndex()
    start_time = time.time()
    ii.createIndex()
    print("--- %s seconds ---" % (time.time() - start_time))
I used a multiprocessing.Manager to write everything into one shared dictionary, but that solution was too slow. So I went back to the idea of building a separate inverted index in each process and then merging them. But I don't know how to return all the indexes to one process.
Take a look at concurrent.futures (in the standard library) with either ThreadPoolExecutor or ProcessPoolExecutor. I have written about this before and did not test the snippet below, but it is more or less the gist of what I use all the time:
from concurrent.futures import ThreadPoolExecutor, as_completed

def foo(stuff: int) -> dict:
    return {}

things_to_analyze = [1, 2, 3]
threads = []
results = []
with ThreadPoolExecutor() as executor:
    for thing in things_to_analyze:
        threads.append(executor.submit(foo, thing))
    for job in as_completed(threads):
        results.append(job.result())
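Applied to the original question, the same pattern with ProcessPoolExecutor might look roughly like this. This is only a sketch; build_partial_index and merge_indexes are hypothetical names standing in for the asker's oneProcessTask and merge step:

from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import re

def build_partial_index(paths):
    # Hypothetical stand-in for oneProcessTask: maps word -> set of file names.
    partial = {}
    for name in paths:
        with open(name) as f:
            for w in re.findall(r'\b\w+\b', f.read()):
                partial.setdefault(w, set()).add(str(name))
    return partial

def merge_indexes(partials):
    # Fold the per-process dictionaries into one index.
    merged = {}
    for d in partials:
        for word, docs in d.items():
            merged.setdefault(word, set()).update(docs)
    return merged

if __name__ == '__main__':
    pathList = list(Path('data').glob('**/*.txt'))
    chunks = [pathList[i::4] for i in range(4)]  # 4 roughly equal groups of files
    with ProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(build_partial_index, c) for c in chunks]
        index = merge_indexes(f.result() for f in as_completed(futures))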
I found a solution. I used pool.starmap to return a list of indexes.
My code:
class InvertedIndex:
    def __init__(self):
        self.smallIndexes = None
        self.index = dict()

    def createIndex(self, path='data', threads_num=4):
        pathList = list(Path(path).glob('**/*.txt'))  # Recursively walk all text files and collect them into a list
        fileNum = len(pathList)
        oneProcessNum = fileNum / threads_num  # Work out how many files one process should handle
        processes_args = []
        for i in range(threads_num):
            startIndex = int(i * oneProcessNum)
            endIndex = int((i + 1) * oneProcessNum)
            processes_args.append((path, startIndex, endIndex))
        pool = mp.Pool(threads_num)
        self.smallIndexes = pool.starmap(self.oneProcessTask, processes_args)
        self.mergeIndex()

    @staticmethod
    def oneProcessTask(path, startIndex, endIndex):
        pathList = list(Path(path).glob('**/*.txt'))
        listOfDoc = pathList[startIndex:endIndex]
        tempDict = dict()
        for name in listOfDoc:
            with open(name) as f:
                text = f.read()
                li = re.findall(r'\b\w+\b', text)
                for w in li:
                    if tempDict.get(w) is None:
                        tempDict[w] = set()
                    tempDict[w].add(str(name))
        return tempDict
Execution time decreased from 200 seconds (when I used shared memory and a Manager().dict()) to 0.8 seconds (with pool.starmap).
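For reference, the mergeIndex method called from createIndex is not shown above; a minimal sketch of what it could look like, assuming self.smallIndexes is the list of dictionaries returned by pool.starmap:

    def mergeIndex(self):
        # Fold every per-process dictionary into the final index.
        for smallIndex in self.smallIndexes:
            for word, docs in smallIndex.items():
                if word in self.index:
                    self.index[word].update(docs)
                else:
                    self.index[word] = set(docs)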
I have a problem with the multiprocessing code below: after running it in a Jupyter notebook, the program freezes and a kernel restart is required. The data variable contains a 60x2001 DataFrame; part of this data is written into dict_results, with a string built from the data as the key and the variable score as the value.
import multiprocessing as mp
from tqdm import tqdm

# linear_model_pred_iloc and the DataFrame `tumor` are defined elsewhere in the notebook.

def search_regression_loop(data, row, dict_results):
    for row2 in range(2000):
        if row != row2:
            score = linear_model_pred_iloc(data.iloc[:, :51], row, row2)
            if (score > 0.30) and (score != 1):
                if (data.iloc[row, 54] is not None) and (data.iloc[row2, 54] is not None):
                    name = str(data.iloc[row, 54]) + ' - ' + str(data.iloc[row2, 54])
                    dict_results[name] = score
                else:
                    name = str(data.iloc[row].name) + ' - ' + str(data.iloc[row2].name)
                    dict_results[name] = score

manager = mp.Manager()
dict_results = manager.dict()

procs = []
for row in tqdm(range(2000)):
    p = mp.Process(target=search_regression_loop, args=(tumor, row, dict_results))
    procs.append(p)
    p.start()

for p in procs:
    p.join()
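For comparison only (this is not from the post), the same work could be driven by a fixed-size Pool so that only a handful of processes exist at a time, instead of 2000 Process objects sharing a managed dict. In this sketch, linear_model_pred_iloc and tumor are assumed to be defined as in the question, and the worker returns its results rather than writing into shared state:

import multiprocessing as mp

def init_worker(df):
    # Give each worker its own copy of the DataFrame once, at pool start-up.
    global data
    data = df

def search_regression_row(row):
    # Same body as search_regression_loop, but returns a plain dict.
    results = {}
    for row2 in range(2000):
        if row != row2:
            score = linear_model_pred_iloc(data.iloc[:, :51], row, row2)  # assumed defined elsewhere
            if (score > 0.30) and (score != 1):
                if (data.iloc[row, 54] is not None) and (data.iloc[row2, 54] is not None):
                    name = str(data.iloc[row, 54]) + ' - ' + str(data.iloc[row2, 54])
                else:
                    name = str(data.iloc[row].name) + ' - ' + str(data.iloc[row2].name)
                results[name] = score
    return results

if __name__ == '__main__':
    dict_results = {}
    with mp.Pool(processes=4, initializer=init_worker, initargs=(tumor,)) as pool:  # tumor assumed defined
        for partial_result in pool.imap_unordered(search_regression_row, range(2000)):
            dict_results.update(partial_result)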
I have around 1500 CSV files with OHLC stock data, each containing 90,000-100,000 rows.
Below is the multiprocessing code that processes each of the files (with a number of iterations). When I tried to use 16 processes, my system started to hang a bit. I am fairly sure this is because of heavy I/O (the system has to open each and every file). Is it a good idea to load all 1500 CSV files into one dictionary and then run the code? Would that reduce the time or stop the hanging?
Also, the system works fine with 10 processes.
Here is what the OHLC data looks like: [screenshot of the OHLC data omitted]
import numpy as np
import pandas as pd
import os
import multiprocessing
import datetime
import itertools
import time
import warnings
warnings.filterwarnings('ignore')

# bank nifty
bn_futures = pd.read_csv('E:\\Tanmay\\Data\\Bank Nifty Index\\BankNifty_Futures GFDL 2011-2020.csv')
bn_futures['Date_time'] = bn_futures['Date'] + ' ' + bn_futures['Time']
bn_futures['Date_time'] = pd.to_datetime(bn_futures['Date_time'],format='%Y-%m-%d %H:%M:%S')
bn_futures = bn_futures[bn_futures['Date_time'].dt.date > datetime.date(2016,5,26)]
req_cols = [x for x in bn_futures.columns if 'Unnamed' not in x]
bn_futures = bn_futures[req_cols]
bn_futures['straddle'] = round(bn_futures['Close'],-2)
bn_futures['straddle'] = bn_futures['straddle'].astype(int)
bn_futures['straddle'] = bn_futures['straddle'].astype(str)
bn_futures['Date'] = bn_futures['Date_time'].dt.date
dates = list(set(bn_futures['Date'].to_list()))
dates.sort()
option_files1 = os.listdir('E:\\\\2nd Set\\')
option_files = []
for i in option_files1:
    if datetime.datetime.strptime(i.split('.')[0],'%Y-%m-%d').date() >= datetime.date(2016,5,27):
        option_files.append(i)

def time_loop(start_time,end_time,timeframe):
    start_datetime = datetime.datetime.combine(datetime.datetime.today().date(),start_time)
    end_datetime = datetime.datetime.combine(datetime.datetime.today().date(),end_time)
    difference = int((((end_datetime - start_datetime).total_seconds())/60)/timeframe)
    final_time_list = []
    for i in range(difference):
        final_time_list.append((start_datetime+datetime.timedelta(minutes=i*timeframe)).time())
    return final_time_list

entry_time_list = time_loop(datetime.time(9,19),datetime.time(15,19),5)
sl_list = np.arange(1.1, 2, 0.1)
# sl_list = list(range(1.1,2,0.1))
paramlist = list(itertools.product(entry_time_list,sl_list))

def strategy(main_entry_time,sl):
    print(main_entry_time,sl)
    main_dict = {}
    for file in option_files:
        date = datetime.datetime.strptime(file.split('.')[0],'%Y-%m-%d').date()
        try:
            # reading current date bn futures
            bn = bn_futures[bn_futures['Date'] == date]
            # reading main time bn futures
            b = bn[bn['Date_time'].dt.time == main_entry_time]
            straddle_value = b['straddle'].iloc[0]
            df = pd.read_csv('E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\'+file)
            df['Date_time'] = pd.to_datetime(df['Date_time'],format='%Y-%m-%d %H:%M:%S')
            h = [k for k in df.columns if 'Un' not in k]
            df = df[h]
            total_df = df[(df['Ticker'].str.contains(straddle_value)) & (df['Expiry_number'] == 0) & (df['W/M'] == 'W')]
            option_types = ['CE','PE']
            for option in option_types:
                option_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time == main_entry_time)]
                entry_price = option_df['Close'].iloc[0]
                strike = option
                entry_time = main_entry_time
                trade_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time > main_entry_time)]
                trade_df.sort_values(by='Date_time',inplace=True)
                for t in trade_df.index:
                    if trade_df['Date_time'][t].time() > entry_time:
                        if trade_df['High'][t] > entry_price * sl:
                            exit_price = entry_price * sl
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02* entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date':str(date),'Entry_time':entry_time,'Strike':str(straddle_value)+option,'Entry_price':entry_price,'Exit_price':exit_price,'exit_time':exit_time,'profit':profit,'Reason':'SL'}
                            break
                        if trade_df['Date_time'][t].time() >= datetime.time(15,14,0):
                            exit_price = trade_df['Close'][t]
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02* entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date':str(date),'Entry_time':entry_time,'Strike':str(straddle_value)+option,'Entry_price':entry_price,'Exit_price':exit_price,'exit_time':exit_time,'profit':profit,'Reason':'EOD'}
                            break
        except Exception as yy:
            pass
    final_dict = dict(main_dict)
    final_df = pd.DataFrame(final_dict)
    final_df = final_df.transpose()
    final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')

if __name__=='__main__':
    start_time = time.time()
    # mgr = multiprocessing.Manager()
    # main_dict = mgr.dict()
    total_data = paramlist
    p = multiprocessing.Pool(processes=10)
    p.starmap(strategy,total_data)
    p.close()
Before you can improve the multiprocessing performance, you should make sure your serial implementation is as efficient as it can be. Have you done that?
Your strategy method currently rereads every option file for each element of total_data passed to it. This is highly inefficient, and moreover it might be contributing significantly to what is stalling your I/O (it depends on caching, which I discuss later). What if the data were put in a database and read up front, or perhaps stored in a dictionary initialized at the beginning?
As for strategy writing out the CSV file: it should instead return the input parameters and final_df to the main process, so that the main process does all the output I/O. For this, imap_unordered with a suitable chunksize argument is better suited, so that the main process can write the results as they become available. Because we are no longer using starmap, strategy will now be passed a tuple that has to be unpacked:
import numpy as np
import pandas as pd
import os
import multiprocessing
import datetime
import itertools
import time
import warnings
warnings.filterwarnings('ignore')

# bank nifty
if __name__ == '__main__':
    bn_futures = pd.read_csv('E:\\Tanmay\\Data\\Bank Nifty Index\\BankNifty_Futures GFDL 2011-2020.csv')
    bn_futures['Date_time'] = bn_futures['Date'] + ' ' + bn_futures['Time']
    bn_futures['Date_time'] = pd.to_datetime(bn_futures['Date_time'],format='%Y-%m-%d %H:%M:%S')
    bn_futures = bn_futures[bn_futures['Date_time'].dt.date > datetime.date(2016,5,26)]
    req_cols = [x for x in bn_futures.columns if 'Unnamed' not in x]
    bn_futures = bn_futures[req_cols]
    bn_futures['straddle'] = round(bn_futures['Close'],-2)
    bn_futures['straddle'] = bn_futures['straddle'].astype(int)
    bn_futures['straddle'] = bn_futures['straddle'].astype(str)
    bn_futures['Date'] = bn_futures['Date_time'].dt.date
    dates = list(set(bn_futures['Date'].to_list()))
    dates.sort()
    option_files1 = os.listdir('E:\\\\2nd Set\\')
    option_files = []
    for i in option_files1:
        if datetime.datetime.strptime(i.split('.')[0],'%Y-%m-%d').date() >= datetime.date(2016,5,27):
            option_files.append(i)

    def time_loop(start_time,end_time,timeframe):
        start_datetime = datetime.datetime.combine(datetime.datetime.today().date(),start_time)
        end_datetime = datetime.datetime.combine(datetime.datetime.today().date(),end_time)
        difference = int((((end_datetime - start_datetime).total_seconds())/60)/timeframe)
        final_time_list = []
        for i in range(difference):
            final_time_list.append((start_datetime+datetime.timedelta(minutes=i*timeframe)).time())
        return final_time_list

    entry_time_list = time_loop(datetime.time(9,19),datetime.time(15,19),5)
    sl_list = np.arange(1.1, 2, 0.1)
    # sl_list = list(range(1.1,2,0.1))
    paramlist = list(itertools.product(entry_time_list,sl_list))

def init_pool_processes(o_f):
    global option_files
    option_files = o_f

def strategy(tpl):
    main_entry_time, sl = tpl # unpack tuple
    print(main_entry_time,sl)
    main_dict = {}
    for file in option_files:
        date = datetime.datetime.strptime(file.split('.')[0],'%Y-%m-%d').date()
        try:
            # reading current date bn futures
            # NOTE: bn_futures is now created inside the __main__ guard, so like option_files
            # it would also have to be made available to the workers (e.g. via initargs).
            bn = bn_futures[bn_futures['Date'] == date]
            # reading main time bn futures
            b = bn[bn['Date_time'].dt.time == main_entry_time]
            straddle_value = b['straddle'].iloc[0]
            df = pd.read_csv('E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\'+file)
            df['Date_time'] = pd.to_datetime(df['Date_time'],format='%Y-%m-%d %H:%M:%S')
            h = [k for k in df.columns if 'Un' not in k]
            df = df[h]
            total_df = df[(df['Ticker'].str.contains(straddle_value)) & (df['Expiry_number'] == 0) & (df['W/M'] == 'W')]
            option_types = ['CE','PE']
            for option in option_types:
                option_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time == main_entry_time)]
                entry_price = option_df['Close'].iloc[0]
                strike = option
                entry_time = main_entry_time
                trade_df = total_df[(total_df['Ticker'].str.contains(option)) & (total_df['Date_time'].dt.time > main_entry_time)]
                trade_df.sort_values(by='Date_time',inplace=True)
                for t in trade_df.index:
                    if trade_df['Date_time'][t].time() > entry_time:
                        if trade_df['High'][t] > entry_price * sl:
                            exit_price = entry_price * sl
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02* entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date':str(date),'Entry_time':entry_time,'Strike':str(straddle_value)+option,'Entry_price':entry_price,'Exit_price':exit_price,'exit_time':exit_time,'profit':profit,'Reason':'SL'}
                            break
                        if trade_df['Date_time'][t].time() >= datetime.time(15,14,0):
                            exit_price = trade_df['Close'][t]
                            exit_time = trade_df['Date_time'][t].time()
                            profit = entry_price - exit_price - 0.02* entry_price
                            main_dict['SL_'+str(sl)+'entry_time_'+str(main_entry_time)+'entry_date_'+str(date)+'_'+option] = {'Entry_date':str(date),'Entry_time':entry_time,'Strike':str(straddle_value)+option,'Entry_price':entry_price,'Exit_price':exit_price,'exit_time':exit_time,'profit':profit,'Reason':'EOD'}
                            break
        except Exception as yy:
            pass
    #final_dict = dict(main_dict) # why make a copy?
    final_df = pd.DataFrame(main_dict)
    final_df = final_df.transpose()
    #final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')
    return (main_entry_time, sl, date, final_df)

def compute_chunksize(iterable_size, pool_size):
    chunksize, remainder = divmod(iterable_size, 4 * pool_size)
    if remainder:
        chunksize += 1
    return chunksize

if __name__=='__main__':
    start_time = time.time()
    # mgr = multiprocessing.Manager()
    # main_dict = mgr.dict()
    total_data = paramlist
    POOL_SIZE = 10
    p = multiprocessing.Pool(processes=POOL_SIZE, initializer=init_pool_processes, initargs=(option_files,))
    chunksize = compute_chunksize(len(total_data), POOL_SIZE)
    results = p.imap_unordered(strategy, total_data, chunksize=chunksize)
    for main_entry_time, sl, date, final_df in results:
        final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')
    p.close()
    p.join()
Since you are running under Windows, as I have mentioned before, code at global scope will be executed by each pool process as part of its initialization, so it is inefficient to have code at global scope that is not required by your worker function strategy and is not contained within an if __name__ == '__main__': block. So that is what I have done. Since your worker function does need to reference option_files (for the time being, until the issue I raised initially is addressed), I have used the initializer and initargs arguments of the Pool constructor so that after the option_files list is created once, by the main process only, it is copied to each process in the pool and used there to initialize a global variable option_files.
But I cannot stress enough that you should figure out a way of eliminating the repeated reading of the files in the option_files list. Ideally you would build a dictionary of some sort that can be passed as another argument to init_pool_processes, so that each pool process has access to a copy of it once it has been constructed. What might save you is that Windows will cache data; depending on the cache size and the size of the CSV files, the I/O bottleneck may not be as big a problem as it might otherwise be.
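As a rough illustration only (assuming the option data, or at least the columns strategy actually needs, fits in memory; if not, the database route above is the better one), the files could be read once in the main process and handed to each worker through the same initializer mechanism:

import multiprocessing
import pandas as pd

OPTION_DIR = 'E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\'

def load_option_data(option_files):
    # Read every option CSV exactly once, in the main process.
    data = {}
    for file in option_files:
        df = pd.read_csv(OPTION_DIR + file)
        df['Date_time'] = pd.to_datetime(df['Date_time'], format='%Y-%m-%d %H:%M:%S')
        data[file] = df
    return data

def init_pool_processes(o_d):
    # Each pool process receives its own copy of the preloaded dictionary.
    global option_data
    option_data = o_d

# In the main block:
#   option_data = load_option_data(option_files)
#   p = multiprocessing.Pool(processes=POOL_SIZE,
#                            initializer=init_pool_processes, initargs=(option_data,))
# and inside strategy, df = option_data[file] replaces the repeated pd.read_csv call.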
In the meantime, you could experiment and force "single threading" of the reading and writing of data by using a multiprocessing.Lock with the following changes. If Windows is caching all the reads after the first time you read all the option files, it will probably not make too big a difference. The code posted above, which already single-threads the writing, should help in any case.
if __name__=='__main__':
    start_time = time.time()
    # mgr = multiprocessing.Manager()
    # main_dict = mgr.dict()
    total_data = paramlist
    POOL_SIZE = 10
    io_lock = multiprocessing.Lock()
    p = multiprocessing.Pool(processes=POOL_SIZE, initializer=init_pool_processes, initargs=(option_files, io_lock))
    chunksize = compute_chunksize(len(total_data), POOL_SIZE)
    results = p.imap_unordered(strategy, total_data, chunksize=chunksize)
    for main_entry_time, sl, date, final_df in results:
        with io_lock:
            final_df.to_csv('SL_'+str(sl)+'entry_time_'+str(main_entry_time).replace(':','')+'entry_date_'+str(date)+'.csv')
    p.close()
    p.join()
And:
def init_pool_processes(o_f, lock):
    global option_files, io_lock
    option_files = o_f
    io_lock = lock
And finally:
def strategy(tpl):
    ...
    straddle_value = b['straddle'].iloc[0]
    with io_lock:
        df = pd.read_csv('E:\\Tanmay\\Data\\Bank nifty Intraday All expiries\\2nd Set\\'+file)
I have a table with 38 million rows. One column is a URL. For each row I need to fetch that URL, extract and process the XML, and save the result in a new column.
I parallelized the entire process into chunks with Pool, and I am using pandas DataFrames as well. I am using 100% of an 8-core machine with a 1000 Mbps connection, and the process is estimated to finish in 12 days.
Any advice on how to improve this?
import gzip
import json
import os
import re
import time
from io import BytesIO
from multiprocessing import Pool

import boto
import numpy as np
import pandas as pd
import xmltodict
from bs4 import BeautifulSoup as bs


# Class abbreviated: URL_REGEX, bucket, datos_adicionales, getUris, pool,
# thread_only and dfs are defined elsewhere in the original class.
class Receptores():
    def aux_tupla(self, df):
        df['aux_tupla'] = df['uri'].str.replace('/v01/', '/depot/').apply(lambda x: self.uriToDicts3(x))
        return df

    def uriToDicts3(self, url):
        regex = self.URL_REGEX.match(url)
        path = "%s/%s/%s/%s.gz" % (regex.group(1), regex.group(2), regex.group(4), regex.group(5))
        _file = self.bucket.get_key(path, validate=False)
        compressed_file = BytesIO()
        try:
            _file.get_file(compressed_file)
            compressed_file.seek(0)
            decompressed_file = gzip.GzipFile(fileobj=compressed_file, mode='rb')
            rq = decompressed_file.read()
        except boto.exception.S3ResponseError as ex:
            print("Error >>", ex.message)
            return json.dumps({}), json.dumps({})
        soup = bs(rq, 'xml')
        detalle = soup.find('Detalle')
        detalle = json.dumps(xmltodict.parse(str(detalle)))
        dictionary = {}
        for key in self.datos_adicionales:
            try:
                value = soup.find(key)
                if value is None:
                    value = soup.find(text=re.compile(key)).parent.parent.find('ValorDA').get_text()
                else:
                    value = value.get_text()
                dictionary[key] = value
            except Exception:
                continue
        dictionary = json.dumps(dictionary)
        return dictionary, detalle

    def pool_only(self, df):
        df_split = np.array_split(df, 8)
        pool = Pool(8)
        df = pd.concat(pool.map(self.aux_tupla, df_split))
        pool.close()
        pool.join()
        return df

    def main(self, dia, choice='pool'):
        t1 = time.time()
        df = self.getUris(dia, limit=True)
        print('FINISHED: {} get Uris in {}'.format(dia, time.time() - t1))
        if choice == 'pool':
            df = self.pool_only(df)
        elif choice == 'combined':
            self.pool(df)
            df = pd.concat(self.dfs)
            print([i.shape[0] for i in self.dfs])
        elif choice == 'thread':
            self.thread_only(df)
            df = pd.concat(self.dfs)
            print([i.shape[0] for i in self.dfs])
        else:
            df['aux_tupla'] = df['uri'].str.replace('/v01/', '/depot/').apply(lambda x: self.uriToDicts3(x))
        print('FINISHED: {} , {} rows uriToDicts3 in {} hours'.format(dia, df.shape[0], (time.time() - t1) / 3600))
        df[['data_adicional', 'detalle']] = df['aux_tupla'].apply(pd.Series)
        df.drop('aux_tupla', axis=1, inplace=True)
        # self.insert_table(df)
        return df


def parallel(dia):
    t1 = time.time()
    a = Receptores().main(dia, choice='pool')
    a.to_csv('{}.csv'.format(dia), index=False)
    # print('LISTO {} - {}'.format(dia, time.time() - t1))
    return a


if __name__ == '__main__':
    t1 = time.time()
    # df = pd.read_csv('dia_emision_batch.csv')
    # dias = [str(i) for i in df.loc[:, 'dia_emision']]
    dias = ['20180101', '20170910', '20170730']
    for i in dias:
        if os.path.exists('{}.csv'.format(i)):
            print('Already exists:', i)
            continue
        try:
            parallel(i)
        except Exception:
            print('Failed!', i)
    print('TOTAL TIME: {}'.format((time.time() - t1) / 3600))
I have the following code, which converts a graph from an edge list to an adjacency matrix:
from scipy.sparse import coo_matrix

# (This is the body of a function; sparse_to_dense maps original vertex IDs to dense indices.)
I, J = [], []
for line in open('graph.txt'):
    converted = [sparse_to_dense.get(int(ID)) for ID in line.split()]
    i = converted[0]
    j = converted[1]
    I.append(i)
    J.append(j)
n = max([max(I), max(J)]) + 1
data = [1]*len(I)
return coo_matrix((data, (I,J)), shape=(n,n), dtype='i1')
This code is awfully slow: on my machine, converting 500k edges takes hours. On the other hand, I/O is obviously not the bottleneck (I can read the full file into memory almost instantaneously), so I think there is room for parallelism. But I'm not sure how to proceed: should I read the file in parallel, or something else?
Use multiprocessing. One way to do it is the following. I did not check it, and it could be improved further.
import multiprocessing
import queue

from scipy.sparse import coo_matrix

# sparse_to_dense (from the question) is assumed to be a module-level dict.

class Worker(multiprocessing.Process):
    def __init__(self, q, results):
        multiprocessing.Process.__init__(self)
        self.q = q
        self.results = results

    def run(self):
        while True:
            try:
                lineno, linecontents = self.q.get(block=False)
            except queue.Empty:
                break
            converted = [sparse_to_dense.get(int(ID)) for ID in linecontents.split()]
            i = converted[0]
            j = converted[1]
            self.results.put((i, j))

def main():
    q = multiprocessing.Queue()
    results = multiprocessing.JoinableQueue()
    for i, l in enumerate(open(fname)):
        q.put((i, l))
    for _ in range(4):
        w = Worker(q, results)
        w.start()
    I, J = [], []
    # NOTE: draining with block=False right away may stop before the workers have
    # finished; in practice you would join the workers or use sentinels here.
    while True:
        try:
            i, j = results.get(block=False)
        except queue.Empty:
            break
        I.append(i)
        J.append(j)
        results.task_done()
    results.join()
    n = max([max(I), max(J)]) + 1
    data = [1]*len(I)
    coo = coo_matrix((data, (I,J)), shape=(n,n), dtype='i1')
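For what it's worth, here is a simpler sketch of the same idea using multiprocessing.Pool, which avoids the hand-rolled queues. It is untested and assumes sparse_to_dense is a module-level dict so that the worker processes can see it (e.g. when forking on Linux):

from multiprocessing import Pool
from scipy.sparse import coo_matrix

def convert_line(line):
    # Map one "src dst" line to a (row, col) pair of dense indices.
    src, dst = line.split()
    return sparse_to_dense.get(int(src)), sparse_to_dense.get(int(dst))

def build_matrix(fname, workers=4):
    with open(fname) as f, Pool(workers) as pool:
        pairs = pool.map(convert_line, f, chunksize=10000)
    I, J = zip(*pairs)
    n = max(max(I), max(J)) + 1
    return coo_matrix(([1] * len(I), (I, J)), shape=(n, n), dtype='i1')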