Could someone point out what I did wrong with the following dask implementation, since it doesn't seem to use multiple cores?
[Updated with reproducible code]
The code that uses dask:
import time
import numpy as np
import pandas as pd
import dask

bookingID = np.arange(1, 10000)
book_data = pd.DataFrame(np.random.rand(1000))

def calculate_feature_stats(bookingID):
    curr_book_data = book_data
    row = list()
    row.append(bookingID)
    row.append(curr_book_data.min())
    row.append(curr_book_data.max())
    row.append(curr_book_data.std())
    row.append(curr_book_data.mean())
    return row

calculate_feature_stats = dask.delayed(calculate_feature_stats)

rows = []
for bookid in bookingID.tolist():
    row = calculate_feature_stats(bookid)
    rows.append(row)

start = time.time()
rows = dask.persist(*rows)
end = time.time()
print(end - start)  # Execution time = 16s in my machine
The code with the normal implementation, without dask:
bookingID = np.arange(1, 10000)
book_data = pd.DataFrame(np.random.rand(1000))

def calculate_feature_stats_normal(bookingID):
    curr_book_data = book_data
    row = list()
    row.append(bookingID)
    row.append(curr_book_data.min())
    row.append(curr_book_data.max())
    row.append(curr_book_data.std())
    row.append(curr_book_data.mean())
    return row

rows = []
start = time.time()
for bookid in bookingID.tolist():
    row = calculate_feature_stats_normal(bookid)
    rows.append(row)
end = time.time()
print(end - start)  # Execution time = 4s in my machine
So the version without dask is actually faster; how is that possible?
Answer
Extended comment. You should consider that using dask adds roughly 1 ms of overhead per task (see the docs), so if your computation is shorter than that, dask isn't worth the trouble.
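To give an idea of what batching looks like (a sketch of mine, not part of the original code: it reuses book_data and bookingID from the question and amortizes the per-task overhead by handling many IDs per delayed call):
import dask
import numpy as np

@dask.delayed
def stats_for_batch(ids):
    # one task per batch of ids instead of one task per id
    return [[i,
             book_data.min(),
             book_data.max(),
             book_data.std(),
             book_data.mean()] for i in ids]

batches = np.array_split(bookingID, 16)                      # ~16 tasks in total
rows = dask.compute(*[stats_for_batch(b) for b in batches])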
Coming to your specific question, I can think of two possible real-world scenarios:
1. A big dataframe with a column called bookingID and another column with the value
2. A different file for every bookingID
In the second case you can start from this answer, while for the first case you can proceed as follows:
import dask.dataframe as dd
import numpy as np
import pandas as pd
# create dummy df
df = []
for i in range(10_000):
    df.append(pd.DataFrame({"id": i,
                            "value": np.random.rand(1000)}))
df = pd.concat(df, ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)
df.to_parquet("df.parq")
Pandas
%%time
df = pd.read_parquet("df.parq")
out = df.groupby("id").agg({"value":{"min", "max", "std", "mean"}})
out.columns = [col[1] for col in out.columns]
out = out.reset_index(drop=True)
CPU times: user 1.65 s, sys: 316 ms, total: 1.96 s
Wall time: 1.08 s
Dask
%%time
df = dd.read_parquet("df.parq")
out = df.groupby("id").agg({"value":["min", "max", "std", "mean"]}).compute()
out.columns = [col[1] for col in out.columns]
out = out.reset_index(drop=True)
CPU times: user 4.94 s, sys: 427 ms, total: 5.36 s
Wall time: 3.94 s
Final thoughts
In this situation dask starts to make sense if the df doesn't fit in memory.
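As a sketch of what that can look like (my addition, not from the original answer; it assumes a parquet engine such as pyarrow or fastparquet is installed), the whole pipeline can stay lazy and write the small aggregated result straight back to disk, so the full df is never materialized in memory:
out = dd.read_parquet("df.parq").groupby("id").agg({"value": ["min", "max", "std", "mean"]})
out.columns = ["_".join(col) for col in out.columns]  # flatten the MultiIndex columns for parquet
out.to_parquet("stats.parq")                          # computed partition by partition, never all at once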
Related
I have the following function:
def calculateEMAs(df,startIndex,endIndex):
    for index,row in df.iterrows():
        for i in range (1,51):
            if(index-i > 0):
                df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
    print(df)
This for loop takes a long time to calculate the values for the dataframe, as it has to loop 50 times for each row (it takes approximately 62 seconds).
I tried to use a multiprocessing Pool based on this question. My code looks like this now:
def calculateEMAs(df,startIndex,endIndex):
    for index,row in df.iterrows():
        for i in range (startIndex,endIndex):
            if(index-i > 0):
                df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
    print(df)

def main():
    dfClosePrice = getFileDataframe().to_frame()
    pool = Pool()
    time0 = time.time()
    result1 = pool.apply_async(calculateEMAs,[dfClosePrice,1,10])
    result2 = pool.apply_async(calculateEMAs,[dfClosePrice,10,20])
    result3 = pool.apply_async(calculateEMAs,[dfClosePrice,20,30])
    result4 = pool.apply_async(calculateEMAs,[dfClosePrice,30,40])
    result5 = pool.apply_async(calculateEMAs,[dfClosePrice,40,51])
    answer1 = result1.get()
    answer2 = result2.get()
    answer3 = result3.get()
    answer4 = result4.get()
    answer5 = result5.get()
    print(time.time() - time0)
    print(dfClosePrice)
I run the function asynchronously with different values for the for loop. This takes 19 seconds to complete, and I can see the result of each function printed correctly, but the final value of dfClosePrice is a dataframe with only one column (Trade Close); the new columns from each async function are not added to the dataframe. How can I do it the right way?
Solution Using Numpy vectorization
Issue
The multiprocessing attempt cannot work as written: each worker process gets a pickled copy of dfClosePrice, so columns added inside calculateEMAs never reach the parent's dataframe; you would have to return the modified frame from each worker and merge the results. Vectorizing with numpy sidesteps the problem entirely.
Line if(index-i > 0): should be if(index-i >= 0): otherwise we miss the difference of 1
Use 'Close' rather than 'Trade Close' (this doesn't matter for performance, but it avoids renaming the column after pulling data from the web)
Code
import numpy as np
import pandas as pd

def compute_using_np(df, start_index, end_index):
    '''
    Using numpy to vectorize computation
    '''
    nrows = len(df)
    ncols = end_index - start_index

    # container for pairwise differences
    pair_wise_diff = np.empty((nrows, ncols))  # np.zeros((nrows, ncols), dtype = float)
    pair_wise_diff.fill(np.nan)

    # Get values of Trading close column as numpy 1D array
    values = df['Close'].values

    # Compute differences for different offsets
    for offset in range(start_index, end_index):
        # Using numpy to compute vectorized difference (i.e. faster computation)
        diff = np.abs(values[offset:] - values[:-offset])/2.0
        # Update result
        pair_wise_diff[offset:, offset-start_index] = diff

    # Place into DataFrame
    columns = ["EMA%d"%i for i in range(start_index, end_index)]
    df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)

    # Add result to df merging on index
    return df.join(df_result)
Usage
df_result = compute_using_np(df, 1, 51)
Performance
Summary
Posted Code: 37.9 s ± 143 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Numpy Code: 1.56 ms ± 27.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Result: 20K times speed up
Test Code
import pandas_datareader as dr
import pandas as pd
import numpy as np
def calculateEMAs(df, start_index, end_index):
    '''
    Posted code, changed to 1) use the Python PEP 8 naming convention,
    and 2) correct the conditional
    '''
    for index,row in df.iterrows():
        for i in range (start_index, end_index):
            if(index-i >= 0):
                df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Close"] - df.iloc[index]["Close"])/2 #replace this with EMA formula
    return df

def compute_using_np(df, start_index, end_index):
    '''
    Using numpy to vectorize computation
    '''
    nrows = len(df)
    ncols = end_index - start_index

    # container for pairwise differences
    pair_wise_diff = np.empty((nrows, ncols))  # np.zeros((nrows, ncols), dtype = float)
    pair_wise_diff.fill(np.nan)

    # Get values of Trading close column as numpy 1D array
    values = df['Close'].values

    # Compute differences for different offsets
    for offset in range(start_index, end_index):
        # Using numpy to compute vectorized difference (i.e. faster computation)
        diff = np.abs(values[offset:] - values[:-offset])/2.0
        # Update result
        pair_wise_diff[offset:, offset-start_index] = diff

    # Place into DataFrame
    columns = ["EMA%d"%i for i in range(start_index, end_index)]
    df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)

    # Add result to df merging on index
    return df.join(df_result)
# Get ibm closing stock pricing (777 DataFrame rows)
df = dr.data.get_data_yahoo('ibm', start = '2017-09-01', end = '2020-10-02')
df.reset_index(level=0, inplace = True) # create index which is 0, 1, 2, ...
# Time Original post
df1 = df.copy() # Copy data since operation is inplace
%timeit calculateEMAs(df1, 1, 51) # Jupyter Notebook Magic method
# Time Numpy Version
%timeit compute_using_np(df, 1, 51) # Jupyter Notebook Magic method
# No need to copy since operation is not inplace
I have (900k, 300) records in a mongo collection.
When I try to read the data into pandas, the memory consumption increases dramatically until the process is killed.
I have to mention that the data fits in memory (~1.5GB) if I read it from a csv file.
My machine has 32GB of RAM and 16 CPUs, running CentOS 7.
My simple code:
import pandas as pd
from pymongo import MongoClient

client = MongoClient(host, port)
collection = client[db_name][collection_name]
cursor = collection.find()
df = pd.DataFrame(list(cursor))
My multiprocessing code:
import concurrent.futures
import multiprocessing

def read_mongo_parallel(skipses):
    print("Starting process")
    client = MongoClient(skipses[4], skipses[5])
    db = client[skipses[2]]
    collection = db[skipses[3]]
    print("range of {} to {}".format(skipses[0], skipses[0]+skipses[1]))
    cursor = collection.find().skip(skipses[0]).limit(skipses[1])
    return list(cursor)

all_lists = []
with concurrent.futures.ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
    for rows in executor.map(read_mongo_parallel, skipesess):
        all_lists.extend(rows)

df = pd.DataFrame(all_lists)
The memory increases with both methods and kills the kernel.
What am I doing wrong?
The problem is in the list usage when you build the DataFrame.
The cursor is consumed all at once, making a list with 900k dictionaries inside it, which takes a lot of memory.
You can avoid that if you create an empty DataFrame and then pull the documents in batches, a few documents at a time, appending them to the DataFrame.
def batched(cursor, batch_size):
    batch = []
    for doc in cursor:
        batch.append(doc)
        if batch and not len(batch) % batch_size:
            yield batch
            batch = []

    if batch:   # last documents
        yield batch

df = pd.DataFrame()
for batch in batched(cursor, 10000):
    df = df.append(batch, ignore_index=True)
10000 seems like a reasonable batch size, but you may want to change it according to your memory constraints: the higher it is, the faster this will end, but also the more memory it will use while running.
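A variation worth considering (my sketch, not part of the original answer): since df.append copies the whole growing DataFrame on every call, collecting one small frame per batch and concatenating once at the end scales better for large results:
frames = [pd.DataFrame(batch) for batch in batched(cursor, 10000)]
df = pd.concat(frames, ignore_index=True)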
UPDATE: Adding some benchmarks
Note that this approach does not necessarily make the query take longer, but rather the opposite, as what actually takes time is the process of pulling the documents out of mongodb as dictionaries and allocating them into a list.
Here are some benchmarks with 300K documents that show how this approach, with the right batch_size, is actually even faster than pulling the whole cursor into a list:
The whole cursor into a list
%%time
df = pd.DataFrame(list(db.test.find().limit(300000)))
CPU times: user 35.3 s, sys: 2.14 s, total: 37.5 s
Wall time: 37.7 s
batch_size=10000 <- FASTEST
%%time
df = pd.DataFrame()
for batch in batched(db.test.find().limit(300000), 10000):
    df = df.append(batch, ignore_index=True)
CPU times: user 29.5 s, sys: 1.23 s, total: 30.7 s
Wall time: 30.8 s
batch_size=1000
%%time
df = pd.DataFrame()
for batch in batched(db.test.find().limit(300000), 1000):
    df = df.append(batch, ignore_index=True)
CPU times: user 44.8 s, sys: 2.09 s, total: 46.9 s
Wall time: 46.9 s
batch_size=100000
%%time
df = pd.DataFrame()
for batch in batched(db.test.find().limit(300000), 100000):
    df = df.append(batch, ignore_index=True)
CPU times: user 34.6 s, sys: 1.15 s, total: 35.8 s
Wall time: 36 s
This test harness creates 900k (albeit small) records and runs fine on my stock laptop. Give it a try.
import pymongo
import pandas as pd
db = pymongo.MongoClient()['mydatabase']
db.mycollection.drop()
operations = []
for i in range(900000):
    operations.append(pymongo.InsertOne({'a': i}))
db.mycollection.bulk_write(operations, ordered=False)
cursor = db.mycollection.find({})
df = pd.DataFrame(list(cursor))
print(df.count())
Load the data in chunks.
Using iterator2dataframes from https://stackoverflow.com/a/39446008/12015722
def iterator2dataframes(iterator, chunk_size: int):
    """Turn an iterator into multiple small pandas.DataFrame

    This is a balance between memory and efficiency
    """
    records = []
    frames = []
    for i, record in enumerate(iterator):
        records.append(record)
        if i % chunk_size == chunk_size - 1:
            frames.append(pd.DataFrame(records))
            records = []
    if records:
        frames.append(pd.DataFrame(records))
    return pd.concat(frames)
client = MongoClient(host,port)
collection = client[db_name][collection_name]
cursor = collection.find()
df = iterator2dataframes(cursor, 1000)
Just wanted to make y'all aware of pymongoarrow which is officially developed by MongoDB and solves this problem. It can output query results to arrow tables or pandas data frames and is - according to the docs - the preferred way of loading data from mongo into pandas. It sure worked like a charm for me!
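For reference, a minimal sketch of what that looks like (based on my reading of the pymongoarrow docs; check the current API before relying on it):
from pymongo import MongoClient
from pymongoarrow.monkey import patch_all

patch_all()  # adds the *_pandas_all helpers to pymongo collections

client = MongoClient(host, port)
collection = client[db_name][collection_name]
df = collection.find_pandas_all({})  # query results straight into a DataFrame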
You can try to get data from mongodb in chunks using a slice index, i.e. get 100000 documents at a time from mongodb, add them to a dataframe, then fetch the next 100000 documents and append those to the dataframe.
client = MongoClient(host, port)
collection = client[db_name][collection_name]

df = pd.DataFrame()
maxrows = 905679
for i in range(0, maxrows, 100000):
    if (i + 100000 < maxrows):
        cursor = collection.find()[i:i+100000]
    else:
        cursor = collection.find()[i:maxrows]
    df2 = pd.DataFrame(list(cursor))
    df = df.append(df2, ignore_index=True)
Refer below link to know more about slice index in mongodb.
https://api.mongodb.com/python/current/api/pymongo/cursor.html
I have found a solution with multiprocessing and it is the fastest.
import multiprocessing as mp
import pandas as pd
from pymongo import MongoClient
from time import time

def chunks(collection_size, n_cores=mp.cpu_count()):
    """ Return chunks of tuples """
    batch_size = round(collection_size/n_cores)
    rest = collection_size % batch_size
    cumulative = 0
    for i in range(n_cores):
        cumulative += batch_size
        if i == n_cores-1:
            yield (batch_size*i, cumulative+rest)
        else:
            yield (batch_size*i, cumulative)

def parallel_read(skipses, host=HOST, port=PORT):
    print('Starting process on range of {} to {}'.format(skipses[0], skipses[1]))
    client = MongoClient(host, port)
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]
    cursor = collection.find({}, {'_id': False})
    _df = pd.DataFrame(list(cursor[skipses[0]:skipses[1]]))
    return _df

def read_mongo(colc_size, _workers=mp.cpu_count()):
    temp_df = pd.DataFrame()
    pool = mp.Pool(processes=_workers)
    results = [pool.apply_async(parallel_read, args=(chunk,)) for chunk in chunks(colc_size, n_cores=_workers)]
    output = [p.get() for p in results]
    temp_df = pd.concat(output)
    return temp_df

time_0 = time()
df = read_mongo(get_collection_size())
print("Reading database with {} processes took {}".format(mp.cpu_count(), time()-time_0))
Starting process on range of 0 to 53866
Starting process on range of 323196 to 377062
Starting process on range of 430928 to 484794
Starting process on range of 538660 to 592526
Starting process on range of 377062 to 430928
Starting process on range of 700258 to 754124
Starting process on range of 53866 to 107732
Starting process on range of 484794 to 538660
Starting process on range of 592526 to 646392
Starting process on range of 646392 to 700258
Starting process on range of 215464 to 269330
Starting process on range of 754124 to 807990
Starting process on range of 807990 to 915714
Starting process on range of 107732 to 161598
Starting process on range of 161598 to 215464
Starting process on range of 269330 to 323196
Reading database with 16 processes took 142.64860558509827
With one of the examples above (no multiprocessing)
def iterator2dataframes(iterator, chunk_size: int):
    """Turn an iterator into multiple small pandas.DataFrame

    This is a balance between memory and efficiency
    """
    records = []
    frames = []
    for i, record in enumerate(iterator):
        records.append(record)
        if i % chunk_size == chunk_size - 1:
            frames.append(pd.DataFrame(records))
            records = []
    if records:
        frames.append(pd.DataFrame(records))
    return pd.concat(frames)
time_0 = time()
cursor = collection.find()
chunk_size = 1000
df = iterator2dataframes(cursor, chunk_size)
print("Reading database with chunksize = {} took {}".format(chunk_size,time()-time_0))
Reading database with chunksize = 1000 took 372.1170778274536
time_0 = time()
cursor = collection.find()
chunk_size = 10000
df = iterator2dataframes(cursor, chunk_size)
print("Reading database with chunksize = {} took {}".format(chunk_size,time()-time_0))
Reading database with chunksize = 10000 took 367.02637577056885
I work with APIs that return large pandas dataframes. I'm not aware of a fast way to iterate through the dataframe directly, so I cast it to a dictionary with to_dict().
After my data is in dictionary form, the performance is fine. However, the to_dict() operation tends to be a performance bottleneck.
I often group columns of the dataframe together to form a multi-index and use the 'index' orientation for to_dict(). I'm not sure if the large multi-index drives the poor performance.
Is there a faster way to cast a pandas dataframe? Maybe there is a better way to iterate directly over the dataframe without any cast? Not sure if there is a way I could apply vectorization.
Below I give sample code which mimics the issue, with timings:
import pandas as pd
import random as rd
import time
#Given a dataframe from api (model as random numbers)
df_columns = ['A','B','C','D','F','G','H','I']
dict_origin = {col:[rd.randint(0,10) for x in range(0,1000)] for col in df_columns}
dict_origin = pd.DataFrame(dict_origin)
#Transform to pivot table
t0 = time.time()
df_pivot = pd.pivot_table(dict_origin,values=df_columns[-3:],index=df_columns[:-3])
t1 = time.time()
print('Pivot Construction takes: ' + str(t1-t0))
#Iterate over all elements in pivot table
t0 = time.time()
for column in df_pivot.columns:
    for row in df_pivot[column].index:
        test = df_pivot[column].loc[row]
t1 = time.time()
print('Dataframe iteration takes: ' + str(t1-t0))
#Iteration over dataframe too slow. Cast to dictionary (bottleneck)
t0 = time.time()
df_pivot = df_pivot.to_dict('index')
t1 = time.time()
print('Cast to dictionary takes: ' + str(t1-t0))
#Iteration over dictionary is much faster
t0 = time.time()
for row in df_pivot.keys():
    for column in df_pivot[row]:
        test = df_pivot[row][column]
t1 = time.time()
print('Iteration over dictionary takes: ' + str(t1-t0))
Thank you!
The common guidance is: don't iterate; use functions on all rows/columns or on grouped rows/columns. The third timed block in the code below shows how to iterate over the numpy array, which is the .values attribute. The results are:
Pivot Construction takes: 0.012315988540649414
Dataframe iteration takes: 0.32346272468566895
Iteration over values takes: 0.004369020462036133
Cast to dictionary takes: 0.023524761199951172
Iteration over dictionary takes: 0.0010480880737304688
# Test data
import pandas as pd
import random as rd
import time
#Given a dataframe from api (model as random numbers)
df_columns = ['A','B','C','D','F','G','H','I']
dict_origin = {col:[rd.randint(0,10) for x in range(0,1000)] for col in df_columns}
dict_origin = pd.DataFrame(dict_origin)
#Transform to pivot table
t0 = time.time()
df_pivot = pd.pivot_table(dict_origin,values=df_columns[-3:],index=df_columns[:-3])
t1 = time.time()
print('Pivot Construction takes: ' + str(t1-t0))
#Iterate over all elements in pivot table
t0 = time.time()
for column in df_pivot.columns:
    for row in df_pivot[column].index:
        test = df_pivot[column].loc[row]
t1 = time.time()
print('Dataframe iteration takes: ' + str(t1-t0))
#Iterate over all values in pivot table
t0 = time.time()
v = df_pivot.values
for row in range(df_pivot.shape[0]):
    for column in range(df_pivot.shape[1]):
        test = v[row, column]
t1 = time.time()
print('Iteration over values takes: ' + str(t1-t0))
#Iteration over dataframe too slow. Cast to dictionary (bottleneck)
t0 = time.time()
df_pivot = df_pivot.to_dict('index')
t1 = time.time()
print('Cast to dictionary takes: ' + str(t1-t0))
#Iteration over dictionary is much faster
t0 = time.time()
for row in df_pivot.keys():
    for column in df_pivot[row]:
        test = df_pivot[row][column]
t1 = time.time()
print('Iteration over dictionary takes: ' + str(t1-t0))
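One more option not covered above (my addition): if you really do need row-wise iteration on the DataFrame itself, itertuples avoids the repeated label-based lookups and is usually far faster than chained []/.loc indexing. Run this before the to_dict cast, while df_pivot is still a DataFrame:
t0 = time.time()
for row in df_pivot.itertuples(index=False):
    for value in row:
        test = value
t1 = time.time()
print('Iteration over itertuples takes: ' + str(t1-t0))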
I'm working with pandas for high-performance calculations; the function below gives 1 loop, best of 5: 7.24 s per loop for 50,000 rows.
I have to scale it to 1 million rows.
How can I vectorise the function and apply it to all rows so that the overall performance improves?
from datetime import datetime

def weightedFlowAmt(startDate, endDate, tradeDate, tradeAmt):
    startInDays = datetime.strptime(startDate, "%Y-%m-%d")
    endInDays = datetime.strptime(endDate, "%Y-%m-%d")
    tradeInDays = datetime.strptime(tradeDate, "%Y-%m-%d")
    differenceTradeAndEnd = abs((endInDays - tradeInDays).days)
    differenceStartAndEnd = abs((endInDays - startInDays).days)
    weighted_FlowAmt = (tradeAmt * differenceTradeAndEnd)/differenceStartAndEnd
    return weighted_FlowAmt

mutatedCashFlow['flow'] = mutatedCashFlow.apply(lambda row:
    weightedFlowAmt(row['startDate'], row['EndDate'], row['tradeDate'],
                    row['tradeAmount']),
    axis=1)
I think you can remove apply and use vectorized functions:
mutatedCashFlow['startDate'] = pd.to_datetime(mutatedCashFlow['startDate'])
mutatedCashFlow['EndDate'] = pd.to_datetime(mutatedCashFlow['EndDate'])
mutatedCashFlow['tradeDate'] = pd.to_datetime(mutatedCashFlow['tradeDate'])
diffTradeAndEnd=((mutatedCashFlow['EndDate']-mutatedCashFlow['tradeDate']).dt.days).abs()
diffStartAndEnd=((mutatedCashFlow['EndDate']-mutatedCashFlow['startDate']).dt.days).abs()
mutatedCashFlow['flow'] = (mutatedCashFlow['tradeAmount']*diffTradeAndEnd)/diffStartAndEnd
Alternative:
mutatedCashFlow['startDate'] = pd.to_datetime(mutatedCashFlow['startDate'])
mutatedCashFlow['EndDate'] = pd.to_datetime(mutatedCashFlow['EndDate'])
mutatedCashFlow['tradeDate'] = pd.to_datetime(mutatedCashFlow['tradeDate'])
diffTradeAndEnd=mutatedCashFlow['EndDate'].sub(mutatedCashFlow['tradeDate']).dt.days.abs()
diffStartAndEnd=mutatedCashFlow['EndDate'].sub(mutatedCashFlow['startDate']).dt.days.abs()
mutatedCashFlow['flow'] = (mutatedCashFlow['tradeAmount'].mul(diffTradeAndEnd)
                                                         .div(diffStartAndEnd))
print(mutatedCashFlow)
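A minimal usage sketch (with hypothetical sample data, not from the question) showing the vectorized version end to end:
import pandas as pd

# hypothetical sample data; column names follow the question
mutatedCashFlow = pd.DataFrame({
    'startDate':   ['2020-01-01', '2020-01-01'],
    'EndDate':     ['2020-12-31', '2020-06-30'],
    'tradeDate':   ['2020-03-01', '2020-02-15'],
    'tradeAmount': [1000.0, 500.0],
})

for col in ['startDate', 'EndDate', 'tradeDate']:
    mutatedCashFlow[col] = pd.to_datetime(mutatedCashFlow[col])

diffTradeAndEnd = (mutatedCashFlow['EndDate'] - mutatedCashFlow['tradeDate']).dt.days.abs()
diffStartAndEnd = (mutatedCashFlow['EndDate'] - mutatedCashFlow['startDate']).dt.days.abs()
mutatedCashFlow['flow'] = mutatedCashFlow['tradeAmount'] * diffTradeAndEnd / diffStartAndEnd
print(mutatedCashFlow)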
I recently found the dask module, which aims to be an easy-to-use python parallel processing module. A big selling point for me is that it works with pandas.
After reading a bit on its manual page, I can't find a way to do this trivially parallelizable task:
ts.apply(func) # for pandas series
df.apply(func, axis = 1) # for pandas DF row apply
At the moment, to achieve this in dask, AFAIK,
ddf.assign(A=lambda df: df.apply(func, axis=1)).compute() # dask DataFrame
which is ugly syntax and is actually slower than outright
df.apply(func, axis = 1) # for pandas DF row apply
Any suggestions?
Edit: Thanks @MRocklin for the map function. It seems to be slower than plain pandas apply. Is this related to the pandas GIL-releasing issue, or am I doing it wrong?
import numpy as np
import pandas as pd
import dask.dataframe as dd

s = pd.Series([10000]*120)
ds = dd.from_pandas(s, npartitions = 3)

def slow_func(k):
    A = np.random.normal(size = k)  # k = 10000
    s = 0
    for a in A:
        if a > 0:
            s += 1
        else:
            s -= 1
    return s

s.apply(slow_func)           # 0.43 sec
ds.map(slow_func).compute()  # 2.04 sec
map_partitions
You can apply your function to all of the partitions of your dataframe with the map_partitions function.
df.map_partitions(func, columns=...)
Note that func will be given only part of the dataset at a time, not the entire dataset as with pandas apply (which presumably you wouldn't want if you want parallelism).
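For row-wise logic this often looks like running the plain pandas apply inside each partition; a sketch with a more recent dask version (df and func stand in for your own dataframe and function, and the meta hint tells dask the output type):
import dask.dataframe as dd

ddf = dd.from_pandas(df, npartitions=4)
result = ddf.map_partitions(
    lambda part: part.apply(func, axis=1),  # plain pandas apply, one partition at a time
    meta=('result', 'f8'),                  # declare the output: a float64 Series
).compute()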
map / apply
You can map a function row-wise across a series with map
df.mycolumn.map(func)
You can map a function row-wise across a dataframe with apply
df.apply(func, axis=1)
Threads vs Processes
As of version 0.6.0, dask.dataframe parallelizes with threads. Custom Python functions will not receive much benefit from thread-based parallelism. You could try processes instead:
df = dd.read_csv(...)
df.map_partitions(func, columns=...).compute(scheduler='processes')
But avoid apply
However, you should really avoid apply with custom Python functions, both in Pandas and in Dask. This is often a source of poor performance. If you find a way to do your operation in a vectorized manner, then your Pandas code may well be 100x faster and you won't need dask.dataframe at all.
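To make that concrete for the slow_func in the question, a vectorized rewrite (my sketch) replaces the Python loop with numpy operations:
import numpy as np

def fast_vectorized_func(k):
    # +1 for each positive draw, -1 for each non-positive draw, summed in one shot
    A = np.random.normal(size=k)
    return int(np.where(A > 0, 1, -1).sum())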
Consider numba
For your particular problem you might consider numba. This significantly improves your performance.
In [1]: import numpy as np
In [2]: import pandas as pd
In [3]: s = pd.Series([10000]*120)
In [4]: %paste
def slow_func(k):
A = np.random.normal(size = k) # k = 10000
s = 0
for a in A:
if a > 0:
s += 1
else:
s -= 1
return s
## -- End pasted text --
In [5]: %time _ = s.apply(slow_func)
CPU times: user 345 ms, sys: 3.28 ms, total: 348 ms
Wall time: 347 ms
In [6]: import numba
In [7]: fast_func = numba.jit(slow_func)
In [8]: %time _ = s.apply(fast_func) # First time incurs compilation overhead
CPU times: user 179 ms, sys: 0 ns, total: 179 ms
Wall time: 175 ms
In [9]: %time _ = s.apply(fast_func) # Subsequent times are all gain
CPU times: user 68.8 ms, sys: 27 µs, total: 68.8 ms
Wall time: 68.7 ms
Disclaimer, I work for the company that makes both numba and dask and employs many of the pandas developers.
More recent versions of dask.dataframe.apply delegate responsibility to map_partitions:
@insert_meta_param_description(pad=12)
def apply(self, func, convert_dtype=True, meta=no_default, args=(), **kwds):
    """ Parallel version of pandas.Series.apply
    ...
    """
    if meta is no_default:
        msg = ("`meta` is not specified, inferred from partial data. "
               "Please provide `meta` if the result is unexpected.\n"
               "  Before: .apply(func)\n"
               "  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n"
               "  or:     .apply(func, meta=('x', 'f8')) for series result")
        warnings.warn(msg)

        meta = _emulate(M.apply, self._meta_nonempty, func,
                        convert_dtype=convert_dtype,
                        args=args, **kwds)

    return map_partitions(M.apply, self, func,
                          convert_dtype, args, meta=meta, **kwds)
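In practice that means the warning from the snippet above goes away if you pass meta explicitly; a small sketch reusing s and slow_func from the question:
import dask.dataframe as dd

ds = dd.from_pandas(s, npartitions=3)
# passing meta up front avoids the "inferred from partial data" warning
result = ds.apply(slow_func, meta=('slow_func', 'i8')).compute()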