I have a dataset of 3 parameters 'A', 'B', 'C' in a .TXT file. After printing them as 24x20 matrices, I need to collect the 1st elements of 'A', 'B', 'C' into a long array in a pandas DataFrame, then the 2nd elements of each, then the 3rd, and so on up to the 480th elements.
My data in the text file looks like this:
id_set: 000
A: -2.46882615679
B: -2.26408246559
C: -325.004619528
I already made a pandas DataFrame with 3 columns 'A', 'B', 'C' plus an index, and defined functions to print the 24x20 matrix in the right way. A simple example with 2x2 matrices:
1st cycle: A = [[1, 2], [3, 4]]   B = [[4, 5], [6, 7]]   C = [[8, 9], [10, 11]]
2nd cycle: A = [[0, 8], [2, 5]]   B = [[1, 9], [4, 8]]   C = [[10, 1], [2, 7]]
Reshape to this form:
A(1,1),B(1,1),C(1,1),A(1,2),B(1,2),C(1,2),.....
Result= [1,4,8,2,5,9,3,6,10,4,7,11] #1st cycle
[0,1,10,8,9,1,2,4,2,5,8,7] #2nd cycle
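For reference, the interleaving on the 2x2 example can be done with plain NumPy; a minimal sketch, assuming the arrays are exactly as shown above:
import numpy as np

A = np.array([[1, 2], [3, 4]])
B = np.array([[4, 5], [6, 7]])
C = np.array([[8, 9], [10, 11]])

# column_stack pairs up the i-th element of A, B, C; ravel flattens row by row
result = np.column_stack([A.ravel(), B.ravel(), C.ravel()]).ravel()
print(result)  # [ 1  4  8  2  5  9  3  6 10  4  7 11]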
My script is the following:
import numpy as np
import pandas as pd
import os
def normalize(value, min_value, max_value, min_norm, max_norm):
    new_value = ((max_norm - min_norm) * ((value - min_value) / (max_value - min_value))) + min_norm
    return new_value

dft = pd.read_csv(r'D:\mc25.TXT', header=None)
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:,0], 'B': B[:,0], 'C': C[:,0]}
df = pd.DataFrame(data, columns=['A','B','C'], index = id_set[:,0])
#next iteration create all plots, change the number of cycles
cycles = int(len(df)/480)
print(cycles)

for cycle in range(0, 10):
    count = '{:04}'.format(cycle)
    j = cycle * 480
    for i in df:
        try:
            os.mkdir(i)
        except:
            pass
        min_val = df[i].min()
        min_nor = -1
        max_val = df[i].max()
        max_nor = 1
        ordered_data = mkdf(df.iloc[j:j+480][i])
        csv = print_df(ordered_data)
        # Print .csv files containing the matrix of each parameter, named by cycle
        csv.to_csv(f'{i}/{i}{count}.csv', header=None, index=None)
        if 'C' in i:
            min_nor = -40
            max_nor = 150
            # Applying normalization for C between [-40, +150]
            new_value3 = normalize(df['C'].iloc[j:j+480], min_val, max_val, -40, 150)
            df3 = print_df(mkdf(new_value3))
            df3.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
        else:
            # Applying normalization for A, B between [-1, +1]
            new_value1 = normalize(df['A'].iloc[j:j+480], min_val, max_val, -1, 1)
            new_value2 = normalize(df['B'].iloc[j:j+480], min_val, max_val, -1, 1)
            df1 = print_df(mkdf(new_value1))
            df2 = print_df(mkdf(new_value2))
            df1.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
            df2.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
Note 2: I provided a dataset in a text file for 3 cycles:
Text dataset
I am not sure if I understood your question fully, but this is a solution:
Convert your DataFrame to a 2D NumPy array (with as_matrix()), then use ravel() to get a vector of size 480 * 3. Then loop over your cycles and use np.vstack to stack the rows into your result. Here is the code with your example data:
import numpy as np
import pandas as pd

A = [[1, 2, 3, 4], [10, 20, 30, 40]]
B = [[4, 5, 6, 7], [40, 50, 60, 70]]
C = [[8, 9, 10, 11], [80, 90, 100, 110]]
cycles = 2
for cycle in range(cycles):
    data = {'A': A[cycle], 'B': B[cycle], 'C': C[cycle]}
    df = pd.DataFrame(data)
    D = df.as_matrix().ravel()
    if cycle == 0:
        Results = np.array(D)
    else:
        Results = np.vstack((Results, D))
# Output: Results = array([[  1,   4,   8,   2,   5,   9,   3,   6,  10,   4,   7,  11],
#                          [ 10,  40,  80,  20,  50,  90,  30,  60, 100,  40,  70, 110]], dtype=int64)
np.savetxt("Results.csv", Results, delimiter=",")
Is this what you wanted?
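One caveat: DataFrame.as_matrix() was deprecated and later removed in pandas 1.0, so on current versions the line inside the loop would be written with to_numpy() (or .values) instead:
D = df.to_numpy().ravel()  # same result as df.as_matrix().ravel() on older pandas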
I created this function that takes in a DataFrame and returns ndarrays of inputs and labels.
import numpy as np
import pandas as pd

def transform_to_array(dataframe, chunk_size=100):
    grouped = dataframe.groupby('id')

    # initialize accumulators
    X, y = np.zeros([0, 1, chunk_size, 4]), np.zeros([0,])  # original input shape: [0, 1, chunk_size, 4]

    # loop over each group (df[df.id==1] and df[df.id==2])
    for _, group in grouped:
        inputs = group.loc[:, 'A':'D'].values
        label = group.loc[:, 'label'].values[0]

        # calculate number of splits
        N = (len(inputs)-1) // chunk_size
        if N > 0:
            inputs = np.array_split(
                inputs, [chunk_size + (chunk_size*i) for i in range(N)])
        else:
            inputs = [inputs]

        # loop over splits
        for inpt in inputs:
            inpt = np.pad(
                inpt, [(0, chunk_size-len(inpt)), (0, 0)],
                mode='constant')
            # add each input split to the accumulators
            X = np.concatenate([X, inpt[np.newaxis, np.newaxis]], axis=0)
            y = np.concatenate([y, label[np.newaxis]], axis=0)
    return X, y
The function returns X of shape (n_samples, 1, chunk_size, 4) and y of shape (n_samples,).
For example:
N = 10_000
id = np.arange(N)
labels = np.random.randint(5, size=N)
df = pd.DataFrame(data = np.random.randn(N, 4), columns=list('ABCD'))
df['label'] = labels
df.insert(0, 'id', id)
df = df.loc[df.id.repeat(157)]
df.head()
id A B C D label
0 0 -0.571676 -0.337737 -0.019276 -1.377253 1
0 0 -0.571676 -0.337737 -0.019276 -1.377253 1
0 0 -0.571676 -0.337737 -0.019276 -1.377253 1
0 0 -0.571676 -0.337737 -0.019276 -1.377253 1
0 0 -0.571676 -0.337737 -0.019276 -1.377253 1
This generates the following:
X, y = transform_to_array(df)
X.shape # shape of input
(20000, 1, 100, 4)
y.shape # shape of label
(20000,)
This function works as intended (each of the 10,000 ids contributes 157 rows, which split into 2 padded chunks of 100, hence 20,000 samples); however, it takes a long time to finish execution:
import time

start_time = time.time()
X, y = transform_to_array(df)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds.')
Time taken: 227.83956217765808 seconds.
In an attempt to improve the performance of the function (minimise execution time), I created the following modified function:
def modified_transform_to_array(dataframe, chunk_size=100):
    # group data by 'id'
    grouped = dataframe.groupby('id')
    # initialize lists to store transformed data
    X, y = [], []
    # loop over each group (df[df.id==1] and df[df.id==2])
    for _, group in grouped:
        # get input and label data for group
        inputs = group.loc[:, 'A':'D'].values
        label = group.loc[:, 'label'].values[0]
        # calculate number of splits
        N = (len(inputs)-1) // chunk_size
        if N > 0:
            # split input data into chunks
            inputs = np.array_split(
                inputs, [chunk_size + (chunk_size*i) for i in range(N)])
        else:
            inputs = [inputs]
        # loop over splits
        for inpt in inputs:
            # pad input data to have a chunk size of chunk_size
            inpt = np.pad(
                inpt, [(0, chunk_size-len(inpt)), (0, 0)],
                mode='constant')
            # add each input split and corresponding label to lists
            X.append(inpt)
            y.append(label)
    # convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)
    return X, y
At first, it seemed like I had succeeded in reducing the time taken:
start_time = time.time()
X2, y2 = modified_transform_to_array(df)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds.')
Time taken: 5.842168092727661 seconds.
However, it changes the shape of the returned value:
X2.shape # this should be (20000, 1, 100, 4)
(20000, 100, 4)
y2.shape # this is fine
(20000,)
Question
How do I modify modified_transform_to_array() to return the intended array shape (n_samples, 1, chunk_size, 4) since it is much faster?
You can simply reshape X just before returning it at the end of modified_transform_to_array(), e.g.:
def modified_transform_to_array( ... ):
    ...
    # convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)
    X = X.reshape((X.shape[0], 1, *X.shape[1:]))  # <-- THIS LINE
    return X, y
or, equivalently:
X = X.reshape((X.shape[0], 1, X.shape[1], X.shape[2]))
As pointed out in #MSS's answer, you can achieve the same reshaping with slicing: start from a slice that selects the whole array (i.e. X[:, :, :]) and insert a None (or its more explicit alias np.newaxis) at the position where you want to add a dimension:
X = X[:, None, :, :]
X = X[:, np.newaxis, :, :]
The last two slicings can be replaced by an Ellipsis (...), which essentially produces enough full-axis slices (i.e. : or slice(None)) to fill the remaining array dimensions.
X = X[:, None, ...]
X = X[:, np.newaxis, ...]
You may want to read the relevant section of NumPy's user guide for further explanations on the use of None and Ellipsis in NumPy's slicing.
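For completeness, np.expand_dims names the same operation explicitly; a minimal sketch checking that all three forms agree (using a smaller array than the question's 20000 samples, purely for illustration):
import numpy as np

X = np.zeros((200, 100, 4))  # smaller stand-in for the (20000, 100, 4) array
assert X[:, None, ...].shape == (200, 1, 100, 4)
assert np.expand_dims(X, axis=1).shape == (200, 1, 100, 4)
assert X.reshape((X.shape[0], 1, *X.shape[1:])).shape == (200, 1, 100, 4)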
Add a new axis to X just before returning it in modified_transform_to_array, e.g.:
def modified_transform_to_array( ... ):
    ...
    # convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)
    X = X[:, np.newaxis, ...]  # <-- in this place
    # X = X[:, None, :, :]
    return X, y
I am working on an optimization problem and am having difficulty setting up two constraints together in Python. Below, I simplify my problem to the calculation of area and volume. Only the length can be changed; the other parameters should remain the same.
Constraint 1: the maximum area should be 40000 m2.
Constraint 2: the minimum volume should be 50000 m3.
Here I can set values in the DataFrame for each constraint one by one; how do I modify the code so that both constraints (1 & 2) are met at the same time?
Many thanks for your time and support!
import pandas as pd
import numpy as np
df = pd.DataFrame({'Name': ['A', 'B', 'C', 'D'],
'Length': [1000, 2000, 3000, 5000],
'Width': [5, 12, 14, 16],
'Depth': [15, 10, 15, 18]})
area = (df['Length'])*(df['Width'])
volume = (df['Length'])*(df['Width'])*(df['Depth'])
print(area)
print(volume)
#Width and Depth are constants, only Length can be changed
#Constraint 1: Maximum area should be 40000m2
#Calculation of length parameter by using maximum area, with other given parameters
Constraint_length_a = 40000/ df['Width']
#Constraint 2: Minimum volume should be 50000m3
#Calculation of length parameter by using minimum volume, with other given parameters
Constraint_length_v = 50000/ ((df['Width'])*(df['Depth']))
#Setting Length values considering constraint 1
df.at[0, 'Length'] = Constraint_length_a[0]
df.at[1, 'Length'] = Constraint_length_a[1]
df.at[2, 'Length'] = Constraint_length_a[2]
df.at[3, 'Length'] = Constraint_length_a[3]
#Setting Length values considering constraint 2
df.at[0, 'Length'] = Constraint_length_v[0]
df.at[1, 'Length'] = Constraint_length_v[1]
df.at[2, 'Length'] = Constraint_length_v[2]
df.at[3, 'Length'] = Constraint_length_v[3]
I believe the code below solves the problem you are currently facing.
If I can help any further, let me know.
import pandas as pd
import numpy as np
df = pd.DataFrame({'Name': ['A', 'B', 'C', 'D'],
'Length': [1000, 2000, 3000, 5000],
'Width': [5, 12, 14, 16],
'Depth': [15, 10, 15, 18]})
area = (df['Length'])*(df['Width'])
volume = (df['Length'])*(df['Width'])*(df['Depth'])
def constraint1(df, col, n):
    df.loc[:n, 'length'] = 40000 / df.loc[:n, col]
    df.drop('Length', axis=1, inplace=True)
    return df

def constraint2(df, col, col1, n):
    df.loc[:n, 'length'] = 50000 / ((df.loc[:n, col]) * (df.loc[:n, col1]))
    df.drop('Length', axis=1, inplace=True)
    return df
If you want to apply it across the whole column:
def constraint1a(df, col):
    df['length'] = 40000 / df[col]
    df.drop('Length', axis=1, inplace=True)
    return df

def constraint2a(df, col, col1):
    df['length'] = 50000 / ((df[col]) * (df[col1]))
    df.drop('Length', axis=1, inplace=True)
    return df
# use copies so each call still finds the original 'Length' column
df1 = constraint1(df.copy(), 'Width', 3)
df2 = constraint2(df.copy(), 'Width', 'Depth', 3)
df3 = constraint1a(df.copy(), 'Width')
df4 = constraint2a(df.copy(), 'Width', 'Depth')
Adding the conditions I left out the first time
def constraint1(df, col, col1):
    l = []
    for x, w in zip(df[col], df[col1]):
        if x > 40000:
            l.append(40000 / w)
        else:
            l.append(x)
    df[col] = l
    return df

def constraint2(df, col, col1, col2):
    l = []
    for x, w, d in zip(df[col], df[col1], df[col2]):
        if x <= 50000:
            l.append(50000 / (w*d))
        else:
            l.append(x)
    df[col] = l
    return df

df1 = constraint1(df, 'Length', 'Width')
df2 = constraint2(df, 'Length', 'Width', 'Depth')
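For reference, both constraints can also be satisfied together in one vectorised step: area <= 40000 caps Length at 40000/Width, and volume >= 50000 puts a floor of 50000/(Width*Depth) under it, so clipping Length between the two bounds meets both. A sketch, assuming the floor never exceeds the cap (which holds for the sample data):
import pandas as pd

df = pd.DataFrame({'Name': ['A', 'B', 'C', 'D'],
                   'Length': [1000, 2000, 3000, 5000],
                   'Width': [5, 12, 14, 16],
                   'Depth': [15, 10, 15, 18]})

upper = 40000 / df['Width']                  # largest Length keeping area <= 40000
lower = 50000 / (df['Width'] * df['Depth'])  # smallest Length keeping volume >= 50000
df['Length'] = df['Length'].clip(lower=lower, upper=upper)

print(df.assign(area=df['Length'] * df['Width'],
                volume=df['Length'] * df['Width'] * df['Depth']))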
I am trying to stack, along rows (axis=0), the results of a calculation that produces 3D arrays. I don't know the results ahead of time.
import numpy as np
h = 10
w = 20
c = 30
result_4d = np.???  # empty
for i in range(5):
    result_3d = np.zeros((h, w, c))  # fake calculation
    result_4d = np.???  # stacked result_3ds on axis=0
return result_4d
I've tried various permutations of the numpy *stack calls but I inevitably run into shape mismatch errors.
Put it in a list first and then stack.
h = 10
w = 20
c = 30

l = []
for i in range(5):
    result_3d = np.zeros((h, w, c))  # fake calculation
    l.append(result_3d)

res = np.stack(l, axis=-1)
res.shape  # (10, 20, 30, 5)

# move stacked axis around ...
np.transpose(res, (3, 0, 1, 2)).shape  # (5, 10, 20, 30)
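Note that stacking on the first axis directly gives the desired order without the transpose; using the same list l:
res = np.stack(l, axis=0)
res.shape  # (5, 10, 20, 30)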
If you want to update inside the loop instead, you can do this:
res = ''
for i in range(5):
    result_3d = np.zeros((h, w, c))  # fake calculation
    if type(res) is str:
        res = np.array([result_3d])  # add dimension
        continue
    res = np.vstack((res, np.array([result_3d])))  # stack on that dimension
res.shape  # (5, 10, 20, 30)
Pandas has a widely-used groupby facility to split up a DataFrame based on a corresponding mapping, from which you can apply a calculation on each subgroup and recombine the results.
Can this be done flexibly in NumPy without a native Python for-loop? With a Python loop, this would look like:
>>> import numpy as np
>>> X = np.arange(10).reshape(5, 2)
>>> groups = np.array([0, 0, 0, 1, 1])
# Split up elements (rows) of `X` based on their element wise group
>>> np.array([X[groups==i].sum() for i in np.unique(groups)])
array([15, 30])
Above, 15 is the sum of the first three rows of X, and 30 is the sum of the remaining two.
By "flexibly", I just mean that we aren't focusing on one particular computation such as sum, count, or maximum, but rather passing any computation to the grouped arrays.
If not, is there a faster approach than the above?
How about using a scipy sparse matrix?
import numpy as np
from scipy import sparse
import time
x_len = 500000
g_len = 100
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
# original
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
# using scipy sparse matrix
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
    (
        x_sum,
        (groups, np.arange(len(x_sum)))
    ),
    shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
result on my PC
0.15915322303771973
0.012875080108642578
0
More than 10 times faster.
Update!
Let's benchmark the answers of #Paul Panzer and #Daniel F. It is a summation-only benchmark.
import numpy as np
from scipy import sparse
import time
# by #Daniel F
def groupby_np(X, groups, axis=0, uf=np.add, out=None, minlength=0, identity=None):
    if minlength < groups.max() + 1:
        minlength = groups.max() + 1
    if identity is None:
        identity = uf.identity
    i = list(range(X.ndim))
    del i[axis]
    i = tuple(i)
    n = out is None
    if n:
        if identity is None:  # fallback to loops over 0-index for identity
            assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassigned groups"
            s = [slice(None)] * X.ndim
            for i_ in i:
                s[i_] = 0
            out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
        else:
            out = np.full((minlength,), identity, dtype=X.dtype)
    uf.at(out, groups, uf.reduce(X, i))
    if n:
        return out
x_len = 500000
g_len = 200
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
print("original")
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
print("use scipy coo matrix")
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
    (
        x_sum,
        (groups, np.arange(len(x_sum)))
    ),
    shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
print("use scipy csr matrix #Daniel F")
s = time.time()
x_sum = X.sum(axis=1)
c = np.array(sparse.csr_matrix(
    (
        x_sum,
        groups,
        np.arange(len(groups)+1)
    ),
    shape=(len(groups), g_len)
).sum(axis=0)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-c)).sum())
print("use bincount #Paul Panzer #Daniel F")
s = time.time()
d = np.bincount(groups, X.sum(axis=1), g_len)
print(time.time() - s)
#compare
print(np.abs((a-d)).sum())
print("use ufunc #Daniel F")
s = time.time()
e = groupby_np(X, groups)
print(time.time() - s)
#compare
print(np.abs((a-e)).sum())
STDOUT
original
0.2882847785949707
use scipy coo matrix
0.012301445007324219
0
use scipy csr matrix #Daniel F
0.01046299934387207
0
use bincount #Paul Panzer #Daniel F
0.007468223571777344
0.0
use ufunc #Daniel F
0.04431319236755371
0
The winner is the bincount solution. But the csr matrix solution is also very interesting.
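Applied to the small example from the original question, the winning approach is a one-liner (quick check, reusing the X and groups defined there):
X = np.arange(10).reshape(5, 2)
groups = np.array([0, 0, 0, 1, 1])
np.bincount(groups, weights=X.sum(axis=1))
# array([15., 30.])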
#klim's sparse matrix solution would at first sight appear to be tied to summation. We can, however, use it in the general case by converting between the csr and csc formats:
Let's look at a small example:
>>> m, n = 3, 8
>>> idx = np.random.randint(0, m, (n,))
>>> data = np.arange(n)
>>>
>>> M = sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m))
>>>
>>> idx
array([0, 2, 2, 1, 1, 2, 2, 0])
>>>
>>> M = M.tocsc()
>>>
>>> M.indptr, M.indices
(array([0, 2, 4, 8], dtype=int32), array([0, 7, 3, 4, 1, 2, 5, 6], dtype=int32))
As we can see after conversion the internal representation of the sparse matrix yields the indices grouped and sorted:
>>> groups = np.split(M.indices, M.indptr[1:-1])
>>> groups
[array([0, 7], dtype=int32), array([3, 4], dtype=int32), array([1, 2, 5, 6], dtype=int32)]
>>>
We could have obtained the same using a stable argsort:
>>> np.argsort(idx, kind='mergesort')
array([0, 7, 3, 4, 1, 2, 5, 6])
>>>
But sparse matrices are actually faster, even when we allow argsort to use a faster non-stable algorithm:
>>> m, n = 1000, 100000
>>> idx = np.random.randint(0, m, (n,))
>>> data = np.arange(n)
>>>
>>> timeit('sparse.csr_matrix((data, idx, np.arange(n+1)), (n, m)).tocsc()', **kwds)
2.250748165184632
>>> timeit('np.argsort(idx)', **kwds)
5.783584725111723
If we require argsort to keep groups sorted, the difference is even larger:
>>> timeit('np.argsort(idx, kind="mergesort")', **kwds)
10.507467685034499
If you want a more flexible implementation of groupby that can group using any of numpy's ufuncs:
def groupby_np(X, groups, axis=0, uf=np.add, out=None, minlength=0, identity=None):
    if minlength < groups.max() + 1:
        minlength = groups.max() + 1
    if identity is None:
        identity = uf.identity
    i = list(range(X.ndim))
    del i[axis]
    i = tuple(i)
    n = out is None
    if n:
        if identity is None:  # fallback to loops over 0-index for identity
            assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassigned groups"
            s = [slice(None)] * X.ndim
            for i_ in i:
                s[i_] = 0
            out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
        else:
            out = np.full((minlength,), identity, dtype=X.dtype)
    uf.at(out, groups, uf.reduce(X, i))
    if n:
        return out
groupby_np(X, groups)
array([15, 30])
groupby_np(X, groups, uf = np.multiply)
array([ 0, 3024])
groupby_np(X, groups, uf = np.maximum)
array([5, 9])
groupby_np(X, groups, uf = np.minimum)
array([0, 6])
There's probably a faster way than this (both of the operands are making copies right now), but:
np.bincount(np.broadcast_to(groups, X.T.shape).ravel(), X.T.ravel())
array([ 15., 30.])
If you want to extend the answer to an ndarray and still have a fast computation, you could extend Daniel's solution:
x_len = 500000
g_len = 200
y_len = 2
X = np.arange(x_len * y_len).reshape(x_len, y_len)
groups = np.random.randint(0, g_len, x_len)
# original
a = np.array([X[groups==i].sum(axis=0) for i in np.unique(groups)])
# alternative
bins = [0] + list(np.bincount(groups, minlength=g_len).cumsum())
Z = np.argsort(groups)
d = np.array([X.take(Z[bins[i]:bins[i+1]],0).sum(axis=0) for i in range(g_len)])
It took about 30 ms (15 ms for creating the bins + 15 ms for the summing) instead of about 280 ms for the original approach in this example.
d.shape
>>> (200, 2)
I have a six-dimensional numeric array A, and I want to reshape it into a two-dimensional array. The rows of the resulting matrix should be multi-indexed by the first three dimensions of A, and the columns should be multi-indexed by the last three dimensions of A. What is the best way to achieve this using pandas or numpy?
Here is a handy function to do just this.
import numpy as np
import pandas as pd

def make2d(a):
    shape = a.shape
    n = len(shape)
    col_lvls = n // 2
    idx_lvls = n - col_lvls
    midx = pd.MultiIndex.from_product(
        [range(i) for i in shape[:idx_lvls]],
        names=['d-{}'.format(d) for d in range(1, idx_lvls + 1)])
    mcol = pd.MultiIndex.from_product(
        [range(i) for i in shape[idx_lvls:]],
        names=['d-{}'.format(d) for d in range(idx_lvls + 1, idx_lvls + col_lvls + 1)])
    return pd.DataFrame(
        a.reshape(np.array(shape[:idx_lvls]).prod(), -1),
        midx, mcol
    )
demonstration
a = np.arange(216).reshape(2, 3, 2, 3, 2, 3)
make2d(a)
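If only the 2-D array itself is needed (without the pandas MultiIndex labels), a plain NumPy reshape gives the same values; a minimal sketch:
a = np.arange(216).reshape(2, 3, 2, 3, 2, 3)
a2d = a.reshape(2 * 3 * 2, -1)  # rows indexed by the first three dims, columns by the last three
a2d.shape  # (12, 18)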