I have the following for function:
def calculateEMAs(df,startIndex,endIndex):
for index,row in df.iterrows():
for i in range (1,51):
if(index-i > 0):
df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
print(df)
This for loop takes a long time to calculate the values for the data frame as it has to loop 50 times for each row (it takes approximately 62 seconds)
I tried to use multiprocessor pool from this question. My code looks like this now:
def calculateEMAs(df,startIndex,endIndex):
for index,row in df.iterrows():
for i in range (startIndex,endIndex):
if(index-i > 0):
df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
print(df)
def main():
dfClosePrice= getFileDataframe().to_frame()
pool = Pool()
time0 = time.time()
result1 = pool.apply_async(calculateEMAs,[dfClosePrice,1,10])
result2 = pool.apply_async(calculateEMAs,[dfClosePrice,10,20])
result3 = pool.apply_async(calculateEMAs,[dfClosePrice,20,30])
result4 = pool.apply_async(calculateEMAs,[dfClosePrice,30,40])
result5 = pool.apply_async(calculateEMAs,[dfClosePrice,40,51])
answer1 = result1.get()
answer2 = result2.get()
answer3 = result3.get()
answer4 = result4.get()
answer5 = result5.get()
print(time.time() - time0)
print(dfClosePrice)
I run the function asynchronously with different values for the for loop. this takes 19 seconds to complete and I can see the result of each function printed correctly but the final value of dfClosePirce is a dataframe with only 1 column (Trade Close) and the new columns from each async function will not be added to the dataframe. How can I do it the right way?
Solution Using Numpy vectorization
Issue
Line if(index-i > 0): should be if(index-i >= 0): otherwise we miss the difference of 1
Use 'Close' rather than 'Trade Close' (doesn't matter for performance but avoid renaming column after pulling data from web)
Code
import numpy as np
import pandas as pd
def compute_using_np(df, start_index, end_index):
'''
Using numpy to vectorize computation
'''
nrows = len(df)
ncols = end_index - start_index
# container for pairwise differences
pair_wise_diff = np.empty((nrows, ncols)) #np.zeros((nrows, ncols), dtype = float)
pair_wise_diff.fill(np.nan)
# Get values of Trading close column as numpy 1D array
values = df['Close'].values
# Compute differences for different offsets
for offset in range(startIndex, endIndex):
# Using numpy to compute vectorized difference (i.e. faster computation)
diff = np.abs(values[offset:] - values[:-offset])/2.0
# Update result
pair_wise_diff[offset:, offset-startIndex] = diff
# Place into DataFrame
columns = ["EMA%d"%i for i in range(start_index, end_index)]
df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)
# Add result to df merging on index
return df.join(df_result)
Usage
df_result = compute_using_np(df, 1, 51)
Performance
Summary
Posted Code: 37.9 s ± 143 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Numpy Code: 1.56 ms ± 27.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Result: 20K times speed up
Test Code
import pandas_datareader as dr
import pandas as pd
import numpy as np
def calculateEMAs(df, start_index, end_index):
'''
Posted code changed 1) use Python PEP 8 naming convention,
2) corrected conditional
'''
for index,row in df.iterrows():
for i in range (start_index, end_index):
if(index-i >= 0):
df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Close"] - df.iloc[index]["Close"])/2 #replace this with EMA formula
return df
def compute_using_np(df, start_index, end_index):
'''
Using numpy to vectorie computation
'''
nrows = len(df)
ncols = end_index - start_index
# container for pairwise differences
pair_wise_diff = np.empty((nrows, ncols)) #np.zeros((nrows, ncols), dtype = float)
pair_wise_diff.fill(np.nan)
# Get values of Trading close column as numpy 1D array
values = df['Close'].values
# Compute differences for different offsets
for offset in range(start_index, end_index):
# Using numpy to compute vectorized difference (i.e. faster computation)
diff = np.abs(values[offset:] - values[:-offset])/2.0
# Update result
pair_wise_diff[offset:, offset-start_index] = diff
# Place into DataFrame
columns = ["EMA%d"%i for i in range(start_index, end_index)]
df_result = pd.DataFrame(data = pair_wise_diff, index = np.arange(nrows), columns = columns)
# Add result to df merging on index
return df.join(df_result)
# Get ibm closing stock pricing (777 DataFrame rows)
df = dr.data.get_data_yahoo('ibm', start = '2017-09-01', end = '2020-10-02')
df.reset_index(level=0, inplace = True) # create index which is 0, 1, 2, ...
# Time Original post
df1 = df.copy() # Copy data since operation is inplace
%timeit calculateEMAs(df1, 1, 51) # Jupyter Notebook Magic method
# Time Numpy Version
%timeit compute_using_np(df, 1, 51) # Jupyter Notebook Magic method
# No need to copy since operation is not inplace
Related
I have a dataframe which contains millions of entries and looks something like this:
Chr
Start
Alt
1
21651521
A
1
41681521
T
1
41681521
T
...
...
...
X
423565
T
I am currently trying to count the number of rows that match several conditions at the same time, i.e. Chr==1, Start==41681521 and Alt==T.
Right now I am using this syntax, which works fine, but seems unpythonic and is also rather slow I think.
num_occurrence = sum((df["Chr"] == chrom) &
(df["Start"] == int(position)) &
(df["Alt"] == allele))
Does anyone have an approach which is more suitable then mine?
Any help is much appreciated!
Cheers!
Alternative 1: pd.DataFrame.query()
You could work with query (see also the illustrative examples here):
expr = "Chr=={chr} & Start=={pos} & Alt=='{alt}'"
ret = df.query(expr.format(chr=chrom, pos=int(position), alt=allele))
In my experiments, this led already to a considerable speedup.
Optimizing this further requires additional information about the data types involved. There are several things you could try:
Alternative 2: Query sorted data
If you can afford to sort your DataFrame prior to querying, you can use pd.Series.searchsorted(). Here is a possible approach:
def query_sorted(df, chrom, position, allele):
"""
Returns index of the matches.
"""
assert df["Start"].is_monotonic_increasing
i_min, i_max = df["Start"].searchsorted([position, position+1])
df = df.iloc[i_min:i_max]
return df[(df["Chr"] == chrom) & (df["Alt"] == allele)].index
# Usage: first sort df by column "Start", then query:
df = df.sort_values("Start")
ret_index = query_sorted(df, chrom, position, allele)
print(len(ret_index))
Alternative 3: Use hashes
Another idea would be to use hashes. Again, this requires some calculations up front, but it speeds up the query considerably. Here is an example based on pd.util.hash_pandas_object():
def query_hash(df, chrom, position, allele):
"""
Returns a view on df
"""
assert "hash" in df
dummy = pd.DataFrame([[chrom, position, allele]])
query_hash = pd.util.hash_pandas_object(dummy, index=False).squeeze()
return df[df["hash"] == query_hash].index
# Usage: first compute hashes over the columns of interest, then query
df["hash"] = pd.util.hash_pandas_object(df[["Chr", "Start", "Alt"]],
index=False)
ret_index = query_hash(df, chrom, position, allele)
print(len(ret_index))
Alternative 4: Use a multi-index
Pandas also operates with hashes when accessing rows via the index. Thus, instead of calculating hashes explicitly, as in the previous alternative, one could simply set the index of the DataFrame prior to querying. (Since setting all columns as index would result in an empty DataFrame, I first create a dummy column. For a real DataFrame with additional columns this will probably not be necessary.)
df["dummy"] = None
df = df.set_index(["Chr", "Start", "Alt"])
df = df.sort_index() # Improves performance
print(len(df.loc[(chrom, position, allele)])
# Interestingly, chaining .loc[] is about twice as fast
print(len(df.loc[chrom].loc[position].loc[allele]))
Note that using an index where one index value maps to many records is not always a good idea. Also, this approach is slower than alternative 3, indicating that Pandas does some extra work here.
There are certainly many more ways to improve this, though the alternative approaches will depend on your specific needs.
Results
I tested with n=10M samples on a MacBook Pro (Mid 2015), running Python 3.8, Pandas 1.2.4 and IPython 7.24.1. Note that the performance evaluation depends on the problem size. The relative assessment of the methods therefore will change for different problem sizes.
# original (sum(s)): 1642.0 ms ± 19.1 ms
# original (s.sum()): 639.0 ms ± 21.9 ms
# query(): 175.0 ms ± 1.1 ms
# query_sorted(): 17.5 ms ± 60.4 µs
# query-hash(): 10.6 ms ± 62.5 µs
# multi-index: 71.5 ms ± 0.7 ms
# multi-index (seq.): 36.5 ms ± 0.6 ms
Implementation
This is how I constructed the data and compared the different approaches.
import numpy as np
import pandas as pd
# Create test data
n = int(10*1e6)
df = pd.DataFrame({"Chr": np.random.randint(1,23+1,n),
"Start": np.random.randint(100,999, n),
"Alt": np.random.choice(list("ACTG"), n)})
# Query point
chrom, position, allele = 1, 142, "A"
# Create test data
n = 10000000
df = pd.DataFrame({"Chr": np.random.randint(1,23+1,n),
"Start": np.random.randint(100,999, n),
"Alt": np.random.choice(list("ACTG"), n)})
# Query point
chrom, position, allele = 1, 142, "A"
# Measure performance in IPython
print("original (sum(s)):")
%timeit sum((df["Chr"] == chrom) & \
(df["Start"] == int(position)) & \
(df["Alt"] == allele))
print("original (s.sum()):")
%timeit ((df["Chr"] == chrom) & \
(df["Start"] == int(position)) & \
(df["Alt"] == allele)).sum()
print("query():")
%timeit len(df.query(expr.format(chr=chrom, \
pos=position, \
alt=allele)))
print("query_sorted():")
df_sorted = df.sort_values("Start")
%timeit query_sorted(df_sorted, chrom, position, allele)
print("query-hash():")
df_hash = df.copy()
df_hash["hash"] = pd.util.hash_pandas_object(df_hash[["Chr", "Start", "Alt"]],
index=False)
%timeit query_hash(df_hash, chrom, position, allele)
print("multi-index:")
df_multi = df.copy()
df_multi["dummy"] = None
df_multi = df_multi.set_index(["Chr", "Start", "Alt"]).sort_index()
%timeit df_multi.loc[(chrom, position, allele)]
print("multi-index (seq.):")
%timeit len(df_multi.loc[chrom].loc[position].loc[allele])
Use DataFrame.all + Series.sum:
res = (df[["Chr", "Start", "Alt"]] == [chrom, int(position), allele]).all(1).sum()
For example:
import pandas as pd
# toy data
df = pd.DataFrame(data=[[1, 21651521, "A"], [1, 41681521, "T"], [1, 41681521, "T"]], columns=["Chr", "Start", "Alt"])
chrom, position, allele = 1, "21651521", "A"
res = (df[["Chr", "Start", "Alt"]] == [chrom, int(position), allele]).all(1).sum()
print(res)
Output
1
Could someone point out what I did wrong with following dask implementation, since it doesnt seems to use the multi cores.
[ Updated with reproducible code]
The code that uses dask :
bookingID = np.arange(1,10000)
book_data = pd.DataFrame(np.random.rand(1000))
def calculate_feature_stats(bookingID):
curr_book_data = book_data
row = list()
row.append(bookingID)
row.append(curr_book_data.min())
row.append(curr_book_data.max())
row.append(curr_book_data.std())
row.append(curr_book_data.mean())
return row
calculate_feature_stats = dask.delayed(calculate_feature_stats)
rows = []
for bookid in bookingID.tolist():
row = calculate_feature_stats(bookid)
rows.append(row)
start = time.time()
rows = dask.persist(*rows)
end = time.time()
print(end - start) # Execution time = 16s in my machine
Code with normal implementation without dask :
bookingID = np.arange(1,10000)
book_data = pd.DataFrame(np.random.rand(1000))
def calculate_feature_stats_normal(bookingID):
curr_book_data = book_data
row = list()
row.append(bookingID)
row.append(curr_book_data.min())
row.append(curr_book_data.max())
row.append(curr_book_data.std())
row.append(curr_book_data.mean())
return row
rows = []
start = time.time()
for bookid in bookingID.tolist():
row = calculate_feature_stats_normal(bookid)
rows.append(row)
end = time.time()
print(end - start) # Execution time = 4s in my machine
So, without dask actually faster, how is that possible?
Answer
Extended comment. You should consider that using dask there is about 1ms overhead (see doc) so if your computation is shorther than that then dask It isn't worth the trouble.
Going to your specific question I can think of two possible real world scenario:
1. A big dataframe with a column called bookingID and another value
2. A different file for every bookingID
In the second case you can play from this answer while for the first case you can proceed as following:
import dask.dataframe as dd
import numpy as np
import pandas as pd
# create dummy df
df = []
for i in range(10_000):
df.append(pd.DataFrame({"id":i,
"value":np.random.rand(1000)}))
df = pd.concat(df, ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)
df.to_parquet("df.parq")
Pandas
%%time
df = pd.read_parquet("df.parq")
out = df.groupby("id").agg({"value":{"min", "max", "std", "mean"}})
out.columns = [col[1] for col in out.columns]
out = out.reset_index(drop=True)
CPU times: user 1.65 s, sys: 316 ms, total: 1.96 s
Wall time: 1.08 s
Dask
%%time
df = dd.read_parquet("df.parq")
out = df.groupby("id").agg({"value":["min", "max", "std", "mean"]}).compute()
out.columns = [col[1] for col in out.columns]
out = out.reset_index(drop=True)
CPU times: user 4.94 s, sys: 427 ms, total: 5.36 s
Wall time: 3.94 s
Final thoughts
In this situation dask starts to make sense if the df doesn't fit in memory.
I have millions of records with each record having an integer (p) and a X*3 matrix of values. For each record, the goal is to find a row from the matrix by selection criteria (see the if-statements in the code).
I'm fairly new to Python and try to make use of vectorization in Pandas using parallel computations instead of loops. I have written the program in two versions, one with Pandas+Numpy and another one with simple loops.
I was told that using vectorization and Numpy array operations is faster than loops. But so far, the loop version is about 10x faster:
Here is the program:
import numpy as np
import pandas as pd
import time
d = {
'values': [np.array([[1400,1400,1800000],[1500,1505,4800000],[1300,1305,5000]]), np.array([[800,900,80000],[1400,1420,50000],[1250,1300,60000]]), np.array([[1700,1750,5000000],[1900,1950,5000000],[1600,1600,3000000]]), np.array([])],
'p': [1300, 1350, 1800, 1400]
}
# The Pandas+Numpy version
def selection_numpy(row):
try:
# Select rows where col[0] >= p
c1 = row['values'][row['values'][:,0] >= row['p']]
# Select rows where col[2] > 1000000
c2 = c1[c1[:,2]>1000000]
# Sort by col[0] and return the lowest row
return c2[c2[:,0].argsort()][0]
except:
pass
start = time.time()
df = pd.DataFrame(d)
df['result'] = df.apply(selection_numpy, axis=1)
# print(df.head())
print(time.time()-start)
# The loop version:
def selection_loop(values, p):
lowest_num = 9999999999
lowest_item = None
# Iterate through each row in the matrix and replace lowest_item if it's lower than the previous one
for item in values:
if item[0] >= p and item[2] > 1000000 and item[0] < lowest_num:
lowest_num = item[0]
lowest_item = item
return lowest_item
start = time.time()
d['result'] = []
for i in range(0, 4):
result = selection_loop(d['values'][i], d['p'][i])
d['result'].append(result)
# print(d['result'])
print(time.time()-start)
Both produce the same result values, but the loop version is magnitudes faster (for the actual million record dataset, not for the 4 example records).
I assume there is a simple and elegant solution to find the desired row for each record which uses vectorization and is the fastest. Not sure why the function using Numpy arrays is so slow, but I appreciate any guidance.
I'm selecting one max row per group and I'm using groupby/agg to return index values and select the rows using loc.
For example, to group by "Id" and then select the row with the highest "delta" value:
selected_idx = df.groupby("Id").apply(lambda df: df.delta.argmax())
selected_rows = df.loc[selected_idx, :]
However, it's so slow this way. Actually, my i7/16G RAM laptop hangs when I'm using this query on 13 million rows.
I have two questions for experts:
How can I make this query run fast in pandas? What am I doing wrong?
Why is this operation so expensive?
[Update]
Thank you so much for #unutbu 's analysis!
sort_drop it is! On my i7/32GRAM machine, groupby+idxmax hangs for nearly 14 hours (never return a thing) however sort_drop handled it LESS THAN A MINUTE!
I still need to look at how pandas implements each method but problems solved for now! I love StackOverflow.
The fastest option depends not only on length of the DataFrame (in this case, around 13M rows) but also on the number of groups. Below are perfplots which compare a number of ways of finding the maximum in each group:
If there an only a few (large) groups, using_idxmax may be the fastest option:
If there are many (small) groups and the DataFrame is not too large, using_sort_drop may be the fastest option:
Keep in mind, however, that while using_sort_drop, using_sort and using_rank start out looking very fast, as N = len(df) increases, their speed relative to the other options disappears quickly. For large enough N, using_idxmax becomes the fastest option, even if there are many groups.
using_sort_drop, using_sort and using_rank sorts the DataFrame (or groups within the DataFrame). Sorting is O(N * log(N)) on average, while the other methods use O(N) operations. This is why methods like using_idxmax beats using_sort_drop for very large DataFrames.
Be aware that benchmark results may vary for a number of reasons, including machine specs, OS, and software versions. So it is important to run benchmarks on your own machine, and with test data tailored to your situation.
Based on the perfplots above, using_sort_drop may be an option worth considering for your DataFrame of 13M rows, especially if it has many (small) groups. Otherwise, I would suspect using_idxmax to be the fastest option -- but again, it's important that you check benchmarks on your machine.
Here is the setup I used to make the perfplots:
import numpy as np
import pandas as pd
import perfplot
def make_df(N):
# lots of small groups
df = pd.DataFrame(np.random.randint(N//10+1, size=(N, 2)), columns=['Id','delta'])
# few large groups
# df = pd.DataFrame(np.random.randint(10, size=(N, 2)), columns=['Id','delta'])
return df
def using_idxmax(df):
return df.loc[df.groupby("Id")['delta'].idxmax()]
def max_mask(s):
i = np.asarray(s).argmax()
result = [False]*len(s)
result[i] = True
return result
def using_custom_mask(df):
mask = df.groupby("Id")['delta'].transform(max_mask)
return df.loc[mask]
def using_isin(df):
idx = df.groupby("Id")['delta'].idxmax()
mask = df.index.isin(idx)
return df.loc[mask]
def using_sort(df):
df = df.sort_values(by=['delta'], ascending=False, kind='mergesort')
return df.groupby('Id', as_index=False).first()
def using_rank(df):
mask = (df.groupby('Id')['delta'].rank(method='first', ascending=False) == 1)
return df.loc[mask]
def using_sort_drop(df):
# Thanks to jezrael
# https://stackoverflow.com/questions/50381064/select-the-max-row-per-group-pandas-performance-issue/50389889?noredirect=1#comment87795818_50389889
return df.sort_values(by=['delta'], ascending=False, kind='mergesort').drop_duplicates('Id')
def using_apply(df):
selected_idx = df.groupby("Id").apply(lambda df: df.delta.argmax())
return df.loc[selected_idx]
def check(df1, df2):
df1 = df1.sort_values(by=['Id','delta'], kind='mergesort').reset_index(drop=True)
df2 = df2.sort_values(by=['Id','delta'], kind='mergesort').reset_index(drop=True)
return df1.equals(df2)
perfplot.show(
setup=make_df,
kernels=[using_idxmax, using_custom_mask, using_isin, using_sort,
using_rank, using_apply, using_sort_drop],
n_range=[2**k for k in range(2, 20)],
logx=True,
logy=True,
xlabel='len(df)',
repeat=75,
equality_check=check)
Another way to benchmark is to use IPython %timeit:
In [55]: df = make_df(2**20)
In [56]: %timeit using_sort_drop(df)
1 loop, best of 3: 403 ms per loop
In [57]: %timeit using_rank(df)
1 loop, best of 3: 1.04 s per loop
In [58]: %timeit using_idxmax(df)
1 loop, best of 3: 15.8 s per loop
Using Numba's jit
from numba import njit
import numpy as np
#njit
def nidxmax(bins, k, weights):
out = np.zeros(k, np.int64)
trk = np.zeros(k)
for i, w in enumerate(weights - (weights.min() - 1)):
b = bins[i]
if w > trk[b]:
trk[b] = w
out[b] = i
return np.sort(out)
def with_numba_idxmax(df):
f, u = pd.factorize(df.Id)
return df.iloc[nidxmax(f, len(u), df.delta.values)]
Borrowing from #unutbu
def make_df(N):
# lots of small groups
df = pd.DataFrame(np.random.randint(N//10+1, size=(N, 2)), columns=['Id','delta'])
# few large groups
# df = pd.DataFrame(np.random.randint(10, size=(N, 2)), columns=['Id','delta'])
return df
Prime jit
with_numba_idxmax(make_df(10));
Test
df = make_df(2**20)
%timeit with_numba_idxmax(df)
%timeit using_sort_drop(df)
47.4 ms ± 99.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
194 ms ± 451 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
I have a DataFrame of force-displacement data. The displacement array has been set to the DataFrame index, and the columns are my various force curves for different tests.
How do I calculate the work done (which is "the area under the curve")?
I looked at numpy.trapz which seems to do what I need, but I think that I can avoid looping over each column like this:
import numpy as np
import pandas as pd
forces = pd.read_csv(...)
work_done = {}
for col in forces.columns:
work_done[col] = np.trapz(forces.loc[col], forces.index))
I was hoping to create a new DataFrame of the areas under the curves rather than a dict, and thought that DataFrame.apply() or something might be appropriate but don't know where to start looking.
In short:
Can I avoid the looping?
Can I create a DataFrame of work done directly?
Thanks in advance for any help.
You could vectorize this by passing the whole DataFrame to np.trapz and specifying the axis= argument, e.g.:
import numpy as np
import pandas as pd
# some random input data
gen = np.random.RandomState(0)
x = gen.randn(100, 10)
names = [chr(97 + i) for i in range(10)]
forces = pd.DataFrame(x, columns=names)
# vectorized version
wrk = np.trapz(forces, x=forces.index, axis=0)
work_done = pd.DataFrame(wrk[None, :], columns=forces.columns)
# non-vectorized version for comparison
work_done2 = {}
for col in forces.columns:
work_done2.update({col:np.trapz(forces.loc[:, col], forces.index)})
These give the following output:
from pprint import pprint
pprint(work_done.T)
# 0
# a -24.331560
# b -10.347663
# c 4.662212
# d -12.536040
# e -10.276861
# f 3.406740
# g -3.712674
# h -9.508454
# i -1.044931
# j 15.165782
pprint(work_done2)
# {'a': -24.331559643023006,
# 'b': -10.347663159421426,
# 'c': 4.6622123535050459,
# 'd': -12.536039649161403,
# 'e': -10.276861220217308,
# 'f': 3.4067399176289994,
# 'g': -3.7126739591045541,
# 'h': -9.5084536839888187,
# 'i': -1.0449311137294459,
# 'j': 15.165781517623724}
There are a couple of other problems with your original example. col is a column name rather than a row index, so it needs to index the second dimension of your dataframe (i.e. .loc[:, col] rather than .loc[col]). Also, you have an extra trailing parenthesis on the last line.
Edit:
You could also generate the output DataFrame directly by .applying np.trapz to each column, e.g.:
work_done = forces.apply(np.trapz, axis=0, args=(forces.index,))
However, this isn't really 'proper' vectorization - you are still calling np.trapz separately on each column. You can see this by comparing the speed of the .apply version against calling np.trapz directly:
In [1]: %timeit forces.apply(np.trapz, axis=0, args=(forces.index,))
1000 loops, best of 3: 582 µs per loop
In [2]: %timeit np.trapz(forces, x=forces.index, axis=0)
The slowest run took 6.04 times longer than the fastest. This could mean that an
intermediate result is being cached
10000 loops, best of 3: 53.4 µs per loop
This isn't an entirely fair comparison, since the second version excludes the extra time taken to construct the DataFrame from the output numpy array, but this should still be smaller than the difference in time taken to perform the actual integration.
Here's how to get the cumulative integral along a dataframe column using the trapezoidal rule. Alternatively, the following creates a pandas.Series method for doing your choice of Trapezoidal, Simpson's or Romberger's rule (source):
import pandas as pd
from scipy import integrate
import numpy as np
#%% Setup Functions
def integrate_method(self, how='trapz', unit='s'):
'''Numerically integrate the time series.
#param how: the method to use (trapz by default)
#return
Available methods:
* trapz - trapezoidal
* cumtrapz - cumulative trapezoidal
* simps - Simpson's rule
* romb - Romberger's rule
See http://docs.scipy.org/doc/scipy/reference/integrate.html for the method details.
or the source code
https://github.com/scipy/scipy/blob/master/scipy/integrate/quadrature.py
'''
available_rules = set(['trapz', 'cumtrapz', 'simps', 'romb'])
if how in available_rules:
rule = integrate.__getattribute__(how)
else:
print('Unsupported integration rule: %s' % (how))
print('Expecting one of these sample-based integration rules: %s' % (str(list(available_rules))))
raise AttributeError
if how is 'cumtrapz':
result = rule(self.values)
result = np.insert(result, 0, 0, axis=0)
else:
result = rule(self.values)
return result
pd.Series.integrate = integrate_method
#%% Setup (random) data
gen = np.random.RandomState(0)
x = gen.randn(100, 10)
names = [chr(97 + i) for i in range(10)]
df = pd.DataFrame(x, columns=names)
#%% Cummulative Integral
df_cummulative_integral = df.apply(lambda x: x.integrate('cumtrapz'))
df_integral = df.apply(lambda x: x.integrate('trapz'))
df_do_they_match = df_cummulative_integral.tail(1).round(3) == df_integral.round(3)
if df_do_they_match.all().all():
print("Trapz produces the last row of cumtrapz")