I have the following algorithm that iterates over a DataFrame with a few million rows.
It takes a long time for the algorithm to finish. Do you have any suggestions?
def k_nn_averaging(df: pd.DataFrame, k: int = 15, use_abs_value: bool = False) -> pd.DataFrame:
    df_averaged = df.copy()
    df[helper.modifiable_columns] = df[helper.modifiable_columns].astype(float)
    df_averaged[helper.modifiable_columns] = df_averaged[helper.modifiable_columns].astype(float)
    for i in range(0, df.shape[0]):
        neighbours = list(range(i-k if i-k >= 0 else 0, i+k if i+k <= df_averaged.shape[0] else df_averaged.shape[0]))
        neighbours.remove(i)
        selectedNeighbourIndex = choice(neighbours)
        factor = uniform(0, 1)
        currentSampleValues = df[helper.modifiable_columns].iloc[i]
        neighbourSampleValues = df[helper.modifiable_columns].iloc[selectedNeighbourIndex]
        average = 0
        if not use_abs_value: average = factor*(currentSampleValues - neighbourSampleValues)
        else: average = factor*(abs(currentSampleValues - neighbourSampleValues))
        df_averaged.loc[i, helper.modifiable_columns] = currentSampleValues + average
    return df_averaged
The first thing to try is to vectorize the loop. Here is a modified version that avoids the Python-level loop and uses NumPy operations instead:
import pandas as pd
import numpy as np

def k_nn_averaging(df: pd.DataFrame, k: int = 15, use_abs_value: bool = False) -> pd.DataFrame:
    df_averaged = df.copy()
    df_averaged[helper.modifiable_columns] = df_averaged[helper.modifiable_columns].astype(float)
    num_rows = df.shape[0]
    modifiable_columns = helper.modifiable_columns

    # create a matrix of candidate neighbour indices (offsets -k..+k) for each row
    neighbour_indices = np.arange(num_rows)[:, np.newaxis] + np.arange(-k, k + 1)[np.newaxis, :]
    # mark the current row and the indices outside the range of the DataFrame as invalid
    neighbour_indices[:, k] = -1
    neighbour_indices[neighbour_indices >= num_rows] = -1
    valid = neighbour_indices >= 0

    # pick one valid neighbour per row at random
    scores = np.random.uniform(size=neighbour_indices.shape)
    scores[~valid] = -1.0                       # invalid slots can never be chosen
    chosen = scores.argmax(axis=1)
    selected_neighbour_indices = neighbour_indices[np.arange(num_rows), chosen]

    # one random factor per row, as in the original loop
    factors = np.random.uniform(size=(num_rows, 1))

    # select the current and neighbour values for each row
    current_values = df[modifiable_columns].to_numpy(dtype=float)
    neighbour_values = current_values[selected_neighbour_indices]

    # calculate the average values
    diff = current_values - neighbour_values
    if use_abs_value:
        diff = np.abs(diff)

    # update the values in the output DataFrame
    df_averaged[modifiable_columns] = current_values + factors * diff
    return df_averaged
I think this will be much faster than the original script.
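If you want to check the speed-up on your side, a rough timing harness like the one below can help. It uses a hypothetical stand-in for the helper module (anything exposing a modifiable_columns attribute), purely for illustration:

import time
import numpy as np
import pandas as pd

class helper:                          # hypothetical stand-in for the real helper module
    modifiable_columns = ["a", "b", "c"]

df = pd.DataFrame(np.random.rand(100_000, 3), columns=helper.modifiable_columns)

start = time.perf_counter()
result = k_nn_averaging(df, k=15)
print(f"vectorised version took {time.perf_counter() - start:.2f} s")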
I am trying to swap two rows of a 2D NumPy array. Unfortunately, only one of the rows actually changes. Here is the code:
n = len(A)
perMatrix = np.zeros((n,n))
np.fill_diagonal(perMatrix, 1)
perMatrix = A
# swapping the row
print(perMatrix)
temp = perMatrix[switchIndex1]
print(temp)
# perMatrix[switchIndex1][0] = 14
perMatrix[switchIndex1], perMatrix[switchIndex2] = perMatrix[switchIndex2], perMatrix[switchIndex1]
print(perMatrix)
Here's what the code is outputting:
You could just add (on the line after perMatrix is created):
sigma = [switchIndex1, switchIndex2]
tau = [switchIndex2, switchIndex1]
perMatrix[sigma,:] = perMatrix[tau,:]
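The tuple swap fails because perMatrix[switchIndex1] is a view, not a copy: the first assignment overwrites the very data that the second assignment then reads back, so only one row changes. Fancy indexing on the right-hand side makes a copy first, which is why the one-liner above works. A minimal sketch with made-up indices:

import numpy as np

A = np.arange(16).reshape(4, 4)
perMatrix = A.copy()                      # copy so the original A stays untouched
switchIndex1, switchIndex2 = 0, 2

sigma = [switchIndex1, switchIndex2]
tau = [switchIndex2, switchIndex1]
perMatrix[sigma, :] = perMatrix[tau, :]   # rows 0 and 2 are swapped in one step
print(perMatrix)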
I want to make a NumPy array using np.random.randint(2, size=(4,4)), and its size should be changeable.
I want to print the indexes of the top k rows from weakest to strongest (weaker meaning a smaller number of ones), so I wrote this code:
import numpy as np
arr = np.random.randint(0, 2, size = (r,c))
r, c = 5, 5
print(a)
def weak(array, k):
    np.array(array)
    sum_nums = []
    for i in range(len(array)):
        sum_nums.append((i, sum([i])))
        sorted_sum_nums = sorted(sum_nums, key=lambda x: x[1])
        answer = [idx[0] for idx in sorted_sum_nums[:int(k)]]
        return answer

weak(a, 5)
I tried this code, but it doesn't work.
How can I fix it?
Pay attention to the indentation: you return prematurely (inside the loop), and you should sum the row itself rather than its index:
def weak(array, k):
    np.array(array)
    sum_nums = []
    for i in range(len(array)):
        sum_nums.append((i, sum(array[i])))   # sum the row, not [i]
    sorted_sum_nums = sorted(sum_nums, key=lambda x: x[1])
    answer = [idx[0] for idx in sorted_sum_nums[:int(k)]]
    return answer  # <== here, outside the loop
First, you should initialize r and c before you use them. You also call weak(a, 5), but a is not defined; I think you meant arr instead.
The code below returns the index of the weakest (minimum-sum) row of the array:
import numpy as np

r, c = 5, 5
arr = np.random.randint(0, 2, size=(r, c))

def weak(array, k):
    array = np.array(array)
    summed = [[], []]                    # [row sums, row indices]
    for i, each in enumerate(array):     # an ndarray has no .index(), so track the index here
        summed[0].append(sum(each))
        summed[1].append(i)
    return summed[1][summed[0].index(min(summed[0]))]

weak(arr, 5)
If you want to print the result to the screen:
print(weak(arr,5))
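If you want the k weakest row indices rather than just the single weakest one, a fully vectorised sketch using np.argsort on the row sums would look like this (it reuses the same arr as above):

import numpy as np

def weak(array, k):
    array = np.asarray(array)
    row_sums = array.sum(axis=1)                      # number of ones per row
    return np.argsort(row_sums, kind="stable")[:k]    # weakest rows first

r, c = 5, 5
arr = np.random.randint(0, 2, size=(r, c))
print(weak(arr, 5))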
My Python code generates some matrices (one at a time) through a loop over some index called i.
Storing the matrices under names like mat_0, mat_1, ..., mat_i is straightforward, but I was wondering whether it is somehow possible to store them as indexable elements like mat[0], mat[1], ..., mat[i]?
Note: The matrices are stored in scipy sparse coo_matrix format.
Edit 1: The index i does not necessarily follow a contiguous sequence and may loop over arbitrary values like 0, 2, 3, 7, ... In that case the matrices have to be stored as mat[0], mat[2], mat[3], mat[7], ... and so on.
Edit 2: Minimal working code
import numpy as np
from math import sqrt
from scipy.sparse import coo_matrix, csr_matrix
primesqt = np.array([1.41421356, 1.73205080, 2.23606797, 2.64575131, 3.31662479, 3.60555127, 4.12310562, 4.35889894, 4.79583152, 5.38516480, 5.56776436, 6.08276253, 6.40312423, 6.55743852, 6.85565460, 7.28010988, 7.68114574, 7.81024967, 8.18535277, 8.42614977, 8.54400374, 8.88819441, 9.11043357, 9.43398113, 9.84885780, 10.04987562, 10.14889156, 10.34408043, 10.44030650, 10.63014581, 11.26942766, 11.44552314, 11.70469991, 11.78982612, 12.20655561, 12.28820572, 12.52996408, 12.76714533, 12.92284798, 13.15294643, 13.37908816, 13.45362404, 13.82027496, 13.89244398, 14.03566884, 14.10673597, 14.52583904, 14.93318452, 15.06651917, 15.13274595])
def bg(n, k, min_elem, max_elem):
    allowed = range(max_elem, min_elem-1, -1)
    def helper(n, k, t):
        if k == 0:
            if n == 0:
                yield t
        elif k == 1:
            if n in allowed:
                yield t + (n,)
        elif min_elem * k <= n <= max_elem * k:
            for v in allowed:
                yield from helper(n - v, k - 1, t + (v,))
    return helper(n, k, ())

def BinarySearch(lys, val):
    first = 0
    last = len(lys)-1
    index = -1
    while (first <= last) and (index == -1):
        mid = (first+last)//2
        if lys[mid] == val:
            index = mid
        else:
            if val < lys[mid]:
                last = mid - 1
            else:
                first = mid + 1
    return index

m = 4
dim = 16
nmax = 1
a = []
for n in range(0, (nmax*m)+1):
    for x in bg(n, m, 0, nmax):
        a.append(x)

T = np.zeros(dim)
for ii in range(dim):
    for jj in range(m):
        T[ii] += primesqt[jj]*float(a[ii][jj])
ind = np.argsort(T)
T = sorted(T)

all_bs = [0, 2, 3, 7]  # i_list

# Evaluate 'mat_ee' for each 'ee' given in the list 'all_bs'
for ee in all_bs:
    row = []
    col = []
    val = []
    for ii in range(m):
        for vv in range(dim):
            Tg = 0
            if a[vv][ii]+1 < nmax+1:
                k = np.copy(a[vv])
                elem = sqrt(float(k[ii]+1.0))+ee
                k[ii] = k[ii]+1
                # Generate tag Tg for elem != 0
                for jj in range(m):
                    Tg += float((primesqt[jj])*k[jj])
                # Search location of non-zero element in sorted T
                location = BinarySearch(T, Tg)
                uu = ind[location]
                row.append(uu)
                col.append(vv)
                val.append(elem)
    mat_ee = (coo_matrix((val, (row, col)), shape=(dim, dim)).tocsr())  # To be stored as mat[0], mat[2], mat[3], mat[7]
    print(mat_ee)
A dictionary would allow you to reference an object using an arbitrary (but hashable) key. In your case, you could store the matrices mat_ee in each iteration of the outer loop (for ee in all_bs:) under that ee index:
csr_matrices = {}
for ee in all_bs:
    # your inner loops, all the way to…
    mat_ee = (coo_matrix((val, (row, col)),
                         shape=(dim, dim))
              .tocsr())
    csr_matrices[ee] = mat_ee
From that point on, you can access the elements of the dictionary using the indices you had in all_bs:
print(csr_matrices[2])
and when you inspect the dictionary, you’ll notice it only contains the keys you specified:
print(csr_matrices.keys())
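If you later need to loop over everything you stored, iterating over the dictionary keeps the ee label next to each matrix; a small sketch reusing the csr_matrices dictionary from above:

for ee, mat in sorted(csr_matrices.items()):
    print(f"ee = {ee}: shape {mat.shape}, {mat.nnz} stored non-zeros")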
You could use a list of your objects.
items_list = []
for ee in all_bs:
    # ... build mat_ee as in your loop ...
    items_list.append(mat_ee)
I am doing image processing using Python and trying to create a list that holds numpy.ndarray objects.
My code looks like this:
def Minimum_Close(Shade_Corrected_Image, Size):
    uint32_Shade_Corrected_Image = pymorph.to_int32(Shade_Corrected_Image)
    Angles = []
    [Row, Column] = Shade_Corrected_Image.shape
    Angles = [i*15 for i in range(12)]
    Image_Close = [0 for x in range(len(Angles))]
    Image_Closing = numpy.zeros((Row, Column))
    for s in range(len(Angles)):
        Struct_Element = pymorph.seline(Size, Angles[s])
        Image_Closing = pymorph.close(uint32_Shade_Corrected_Image, Struct_Element)
        Image_Close[s] = Image_Closing
    Min_Close_Image = numpy.zeros(Shade_Corrected_Image.shape)
    temp_array = [][]
    Temp_Cell = numpy.zeros((Row, Column))
    for r in range(1, Row):
        for c in range(1, Column):
            for Cell in Image_Close:
                Temp_Cell = Image_Close[Cell]
                temp_array[Cell] = Temp_Cell[r][c]
            Min_Close_Image[r][c] = min(temp_array)
    Min_Close_Image = Min_Close_Image - Shade_Corrected_Image
    return Min_Close_Image
While running this code I'm getting this error:
Temp_Cell = Image_Close[Cell]
TypeError: only integer arrays with one element can be converted to an index
How can I make a data structure that holds different multi-dimensional arrays and then traverse through it?
Making a list of arrays is not necessary when you're using numpy.
I suggest rewriting the whole function like this:
import numpy as np

def Minimum_Close(shade_corrected_image, size):
    uint32_shade_corrected_image = pymorph.to_int32(shade_corrected_image)
    angles = np.arange(12) * 15
    def pymorph_op(angle):
        struct_element = pymorph.seline(size, angle)
        return pymorph.close(uint32_shade_corrected_image, struct_element)
    # dstack needs a sequence of arrays, not a generator
    image_close = np.dstack([pymorph_op(a) for a in angles])
    min_close_image = np.min(image_close, axis=-1) - shade_corrected_image
    return min_close_image
I lower-cased the variable names so that they stop getting highlighted as classes.
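If you do not have pymorph at hand, the stacking-and-minimum part can be tried on dummy data; this is just a sketch of the np.dstack / np.min pattern, not the actual morphological closing:

import numpy as np

row, column = 4, 5
# twelve dummy "closed" images standing in for the pymorph.close results
closings = [np.random.randint(0, 255, size=(row, column)) for _ in range(12)]

image_close = np.dstack(closings)          # shape (row, column, 12)
min_close = np.min(image_close, axis=-1)   # per-pixel minimum over the 12 angles
print(min_close.shape)                     # (4, 5)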
What about:
for cnt, Cell in enumerate(Image_Close):
    Temp_Cell = Image_Close[cnt]