I have a large MultiIndexed (y, t) single-valued DataFrame df. Currently, I select a subset via df.loc[(Y,T), :] and create a dictionary out of it. The following MWE works, but the selection is very slow for large subsets.
import numpy as np
import pandas as pd
# Full DataFrame
y_max = 50
Y_max = range(1, y_max+1)
t_max = 100
T_max = range(1, t_max+1)
idx_max = tuple((y,t) for y in Y_max for t in T_max)
df = pd.DataFrame(np.random.sample(y_max*t_max), index=idx_max, columns=['Value'])
# Create Dictionary of Subset of Data
y1 = 4
yN = 10
Y = range(y1, yN+1)
t1 = 5
tN = 9
T = range(t1, tN+1)
idx_sub = tuple((y,t) for y in Y for t in T)
data_sub = df.loc[(Y,T), :] #This is really slow
dict_sub = dict(zip(idx_sub, data_sub['Value']))
# result, e.g. (y,t) = (5,7)
dict_sub[5,7] == df.loc[(5,7), 'Value']
I was thinking of slicing with df.loc[(y1,t1):(yN,tN), :], but it does not work properly, as the second index level is only bounded in the final year yN.
One idea is to use Index.isin with itertools.product for boolean indexing:
from itertools import product
idx_sub = tuple(product(Y, T))
dict_sub = df.loc[df.index.isin(idx_sub),'Value'].to_dict()
print (dict_sub)
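Since Y and T are contiguous ranges here, label slicing should also work and avoids building the product explicitly. A minimal sketch, assuming the MultiIndex is (or is made) lexsorted:
idx = pd.IndexSlice
df = df.sort_index()                           # label slicing needs a lexsorted MultiIndex
data_sub = df.loc[idx[y1:yN, t1:tN], 'Value']
dict_sub = data_sub.to_dict()                  # keys are the (y, t) tuples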
So I have a DataFrame df_hist from which I'm sampling one row per group pat_mrn_id. It's pretty darn slow and there has got to be a vectorized way of doing this.
Code example below
import random
import numpy as np
import pandas as pd
N = int(1e8)
A_list = np.random.randint(1, 100, N)
B_list = np.random.randint(1, 100, N)
mrns = [random.randint(0,1000) for i in range(N)]
d = {'pat_mrn_id':mrns,'a_list':A_list,'b_list':B_list}
df_hist = pd.DataFrame(data=d)
df_hist.groupby('pat_mrn_id').apply(lambda x: x.sample(1)).reset_index(drop=True)
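Two approaches that avoid the per-group Python-level apply; a minimal sketch, assuming any row of a group is an acceptable sample and that pandas >= 1.1 is available for the first variant:
# pandas >= 1.1: sample one row per group without apply
sampled = df_hist.groupby('pat_mrn_id').sample(n=1)
# works on older pandas too: shuffle once, then keep the first row seen per group
sampled = df_hist.sample(frac=1).drop_duplicates('pat_mrn_id')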
I imported a CSV into a DataFrame and got Series like this:
In[1]: A = df["data1"]
B = df["data2"]
type(A)
Out[1]: pandas.core.series.Series
I wrote a pearson function in a module like this:
def pearson(vector1, vector2):
    n = len(vector1)
    # simple sums
    sum1 = sum(float(vector1[i]) for i in range(n))
    sum2 = sum(float(vector2[i]) for i in range(n))
    # sum up the squares
    sum1_pow = sum([pow(v, 2.0) for v in vector1])
    sum2_pow = sum([pow(v, 2.0) for v in vector2])
    # sum up the products
    p_sum = sum([vector1[i] * vector2[i] for i in range(n)])
    num = p_sum - (sum1*sum2/n)
    den = ((sum1_pow - pow(sum1, 2)/n) * (sum2_pow - pow(sum2, 2)/n)) ** 0.5
    if den == 0:
        return 0.0
    return num/den
I want to use as_matrix to convert the Series to a NumPy array, but it returns a method, not a NumPy array. How do I get a NumPy array from a Series?
from modulas import pearson1
import numpy as np
An = A.as_matrix
Bn = B.as_matrix
p = pearson(An, Bn)
TypeError: 'module' object is not callable
How do I convert a Series into a NumPy array?
Use values:
series = pd.Series([1, 2, 3], name="a")
series.values
# => array([1, 2, 3])
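On pandas 0.24 and later, Series.to_numpy() is the recommended way to get the underlying array:
series = pd.Series([1, 2, 3], name="a")
series.to_numpy()
# => array([1, 2, 3])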
Change the code to:
An = A.as_matrix()
You have to call the method in order for it to perform its function on the pandas Series.
As @Mad Physicist mentioned, you can use a pandas Series in place of a NumPy array most of the time anyway.
You can also do:
An = A.values
I believe as_matrix will be replaced by values in a future version of pandas.
The task by example:
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
idx = np.array([2, 0, 1, 1, 2, 0, 1, 1, 2])
Expected result:
binned = np.array([2, 6, 3, 4, 7, 8, 1, 5, 9])
Constraints:
Should be fast.
Should be O(n+k) where n is the length of data and k is the number of bins.
Should be stable, i.e. order within bins is preserved.
Obvious solution
data[np.argsort(idx, kind='stable')]
is O(n log n).
O(n+k) solution
def sort_to_bins(idx, data, mx=-1):
    if mx == -1:
        mx = idx.max() + 1
    # count the number of elements in each bin (offset by one)
    cnts = np.zeros(mx + 1, int)
    for i in range(idx.size):
        cnts[idx[i] + 1] += 1
    # prefix sums give the start offset of each bin
    for i in range(1, cnts.size):
        cnts[i] += cnts[i-1]
    # place each element at the next free slot of its bin
    res = np.empty_like(data)
    for i in range(data.size):
        res[cnts[idx[i]]] = data[i]
        cnts[idx[i]] += 1
    return res
is loopy and slow.
Is there a better method in pure numpy < scipy < pandas < numba/pythran?
Here are a few solutions:
Use np.argsort anyway, after all it is fast compiled code.
Use np.bincount to get the bin sizes and np.argpartition which is O(n) for fixed number of bins. Downside: currently, no stable algorithm is available, thus we have to sort each bin.
Use scipy.ndimage.measurements.labeled_comprehension. This does roughly what is required, but no idea how it is implemented.
Use pandas. I'm a complete pandas noob, so what I cobbled together here using groupby may be suboptimal.
Use scipy.sparse: switching between compressed sparse row (CSR) and compressed sparse column (CSC) formats happens to implement exactly the operation we are looking for.
Use pythran (I'm sure numba works as well) on the loopy code in the question. All that is required is to insert the following at the top after the numpy import:
#pythran export sort_to_bins(int[:], float[:], int)
and then compile:
# pythran stb_pthr.py
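For completeness, a minimal sketch of the numba route mentioned above (assumes numba is installed; not benchmarked here), using the same counting sort as the loopy code in the question:
import numpy as np
from numba import njit

@njit
def sort_to_bins_numba(idx, data, mx=-1):
    if mx == -1:
        mx = idx.max() + 1
    cnts = np.zeros(mx + 1, np.int64)   # per-bin counts, offset by one
    for i in range(idx.size):
        cnts[idx[i] + 1] += 1
    for i in range(1, cnts.size):       # prefix sums -> start offset of each bin
        cnts[i] += cnts[i - 1]
    res = np.empty_like(data)
    for i in range(data.size):          # place each element at the next free slot of its bin
        res[cnts[idx[i]]] = data[i]
        cnts[idx[i]] += 1
    return res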
Benchmarks with 100 bins and a variable number of items:
Take home:
If you are OK with numba/pythran, that is the way to go; if not, scipy.sparse scales rather well.
Code:
import numpy as np
from scipy import sparse
from scipy.ndimage.measurements import labeled_comprehension
from stb_pthr import sort_to_bins as sort_to_bins_pythran
import pandas as pd
def sort_to_bins_pandas(idx, data, mx=-1):
    # pandas: group row positions by idx, then gather the data bin by bin
    df = pd.DataFrame.from_dict(data=data)
    out = np.empty_like(data)
    j = 0
    for grp in df.groupby(idx).groups.values():
        out[j:j+len(grp)] = data[np.sort(grp)]
        j += len(grp)
    return out

def sort_to_bins_ndimage(idx, data, mx=-1):
    # scipy.ndimage: labeled_comprehension applies `collect` to each label's values
    if mx == -1:
        mx = idx.max() + 1
    out = np.empty_like(data)
    j = 0
    def collect(bin):
        nonlocal j
        out[j:j+len(bin)] = np.sort(bin)
        j += len(bin)
        return 0
    labeled_comprehension(data, idx, np.arange(mx), collect, data.dtype, None)
    return out

def sort_to_bins_partition(idx, data, mx=-1):
    # argpartition at the bin boundaries (not stable)
    if mx == -1:
        mx = idx.max() + 1
    return data[np.argpartition(idx, np.bincount(idx, None, mx)[:-1].cumsum())]

def sort_to_bins_partition_stable(idx, data, mx=-1):
    # argpartition, then sort the indices within each bin to restore stability
    if mx == -1:
        mx = idx.max() + 1
    split = np.bincount(idx, None, mx)[:-1].cumsum()
    srt = np.argpartition(idx, split)
    for bin in np.split(srt, split):
        bin.sort()
    return data[srt]

def sort_to_bins_sparse(idx, data, mx=-1):
    # CSR -> CSC conversion performs exactly the stable binning we want
    if mx == -1:
        mx = idx.max() + 1
    return sparse.csr_matrix((data, idx, np.arange(len(idx)+1)), (len(idx), mx)).tocsc().data

def sort_to_bins_argsort(idx, data, mx=-1):
    # baseline: stable argsort, O(n log n)
    return data[idx.argsort(kind='stable')]
from timeit import timeit
exmpls = [np.random.randint(0, K, (N,)) for K, N in np.c_[np.full(16, 100), 1<<np.arange(5, 21)]]
timings = {}
for idx in exmpls:
    data = np.arange(len(idx), dtype=float)
    ref = None
    for x, f in (*globals().items(),):
        if x.startswith('sort_to_bins_'):
            timings.setdefault(x.replace('sort_to_bins_', '').replace('_', ' '), []).append(
                timeit('f(idx, data, -1)', globals={'f': f, 'idx': idx, 'data': data}, number=10)*100)
            if x == 'sort_to_bins_partition':
                continue  # not stable, so skip the correctness check
            if ref is None:
                ref = f(idx, data, -1)
            else:
                assert np.all(f(idx, data, -1) == ref)
import pylab
for k, v in timings.items():
pylab.loglog(1<<np.arange(5, 21), v, label=k)
pylab.xlabel('#items')
pylab.ylabel('time [ms]')
pylab.legend()
pylab.show()
How can I select all values where the 'displacement' (second level of MultiIndex) is above a certain value, say > 2?
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
dicts = {}
index = np.linspace(1, 50)
index[2] = 2.0 # Create a duplicate for later testing
for n in range(5):
    dicts['test' + str(n)] = pd.Series(np.linspace(0, 20) ** (n / 5),
                                       index=index)
s = pd.concat(dicts, names=('test', 'displacement'))
# Something like this?
s[s.index['displacement'] > 2]
I tried reading the docs but couldn't work it out, even trying IndexSlice.
Bonus points: how do I select a range, say between 2 and 4?
Thanks in advance for any help.
import pandas as pd
import numpy as np
dicts = {}
index = np.linspace(1, 50)
for n in range(5):
    dicts['test' + str(n)] = pd.Series(np.linspace(0, 20) ** (n / 5),
                                       index=index)
s = pd.concat(dicts, names=('test', 'displacement'))
displacement = s.index.get_level_values('displacement')
r = s.loc[(displacement > 2) & (displacement < 5)]
Inspired by https://stackoverflow.com/a/18103894/268075
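As an alternative for the bonus range selection, pd.IndexSlice can express the same thing; a sketch, assuming s is lexsorted by its index:
s = s.sort_index()                 # label slicing needs a lexsorted MultiIndex
r = s.loc[pd.IndexSlice[:, 2:4]]   # all tests, displacement between 2 and 4 (inclusive)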