Row, column assignment without for-loop - python

I wrote a small script to assign values to a numpy array by knowing their row and column coordinates:
import numpy as np
gridarray = np.zeros([3,3])
gridarray_counts = np.zeros([3,3])
cols = np.random.random_integers(0,2,15)
rows = np.random.random_integers(0,2,15)
data = np.random.random_integers(0,9,15)
for nn in np.arange(len(data)):
    gridarray[rows[nn],cols[nn]] += data[nn]
    gridarray_counts[rows[nn],cols[nn]] += 1
This way I know how many values fall into each grid cell and what their sum is. However, on arrays of length 100000+ this gets quite slow. Is there another way that avoids the for-loop?
Is an approach similar to this possible? I know it doesn't work yet:
gridarray[rows,cols] += data
gridarray_counts[rows,cols] += 1

I would use bincount for this, but for now bincount only takes 1d arrays, so you'll need to write your own ndbincount, something like:
def ndbincount(x, weights=None, shape=None):
    if shape is None:
        shape = x.max(1) + 1
    x = np.ravel_multi_index(x, shape)
    out = np.bincount(x, weights, minlength=np.prod(shape))
    out.shape = shape
    return out
Then you can do:
gridarray = np.zeros([3,3])
cols = np.random.random_integers(0,2,15)
rows = np.random.random_integers(0,2,15)
data = np.random.random_integers(0,9,15)
x = np.vstack([rows, cols])
temp = ndbincount(x, data, gridarray.shape)
gridarray = gridarray + temp
gridarray_counts = ndbincount(x, shape=gridarray.shape)

You can do this directly:
gridarray[rows, cols] += data
gridarray_counts[rows, cols] += 1
Note, however, that fancy-indexed += buffers its reads, so repeated (row, col) pairs are only applied once; this matches the loop only when all index pairs are unique.
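For the general case with duplicate (row, col) pairs, np.add.at applies the addition unbuffered, once per index pair, which reproduces the loop exactly. A minimal sketch (using np.random.randint, since random_integers is deprecated in newer NumPy):
import numpy as np
rows = np.random.randint(0, 3, 15)
cols = np.random.randint(0, 3, 15)
data = np.random.randint(0, 10, 15)
gridarray = np.zeros((3, 3))
gridarray_counts = np.zeros((3, 3))
# unbuffered in-place addition: repeated index pairs accumulate correctly
np.add.at(gridarray, (rows, cols), data)
np.add.at(gridarray_counts, (rows, cols), 1)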

Related

Dataframes from arrays with different lengths - fill missing values by mean of row

I want to create a dataframe out of arrays of different sizes, filling the missing values based on similar values.
I've tried sticking the arrays together and doing a sort and a split with numpy. I then calculate the mean of each split and decide whether a value is close enough to the mean, or whether it is better filled with NaN.
import numpy as np
import pandas as pd

def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

# generate sample data
loa = [((np.arange(np.random.randint(1,3), np.random.randint(3,6)))*val).tolist()
       for val in np.random.uniform(0.9,1.1,5)]
# reshape
flat_list = sum(loa, [])
# add some attributes
attributes = [np.random.randint(-3,-1) for x in range(len(flat_list))]
# sort and split on percentage change
flat_list.sort()
arr = np.array(flat_list)
arr_splits = np.split(arr, np.argwhere(np.diff(arr)/arr[1:]*100 > 12)[:,0])
# means of the splits
means = [np.mean(arr) for arr in arr_splits]
# create dataframe
i = 0
res = np.zeros((len(loa), len(means)*2))*np.nan
for row, l in enumerate(loa):
    for val in l:
        col = find_nearest(means, val)
        res[row, col] = val
        res[row, col+len(means)] = attributes[i]
        i = i + 1
df = pd.DataFrame(res)
Is there a way to do this more directly with pandas, or something more elegant?
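Not a full pandas answer, but the find_nearest lookup itself can be vectorized with broadcasting instead of being called once per value. A minimal sketch with made-up means and values:
import numpy as np
# hypothetical cluster means and a batch of values to place
means = np.array([1.0, 2.0, 3.0])
vals = np.array([0.9, 2.2, 3.1, 1.1])
# broadcast |vals - means| into a distance matrix, then take the
# argmin along the means axis: one call instead of a Python loop
cols = np.abs(vals[:, None] - means).argmin(axis=1)
# cols -> array([0, 1, 2, 0])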

numpy - could not broadcast input unknown error

I am attempting to run the following code, but am getting the following error:
line 71, in cross_validation
    folds[index] = numpy.vstack((folds[index], dataset[jindex]))
ValueError: could not broadcast input array from shape (2,8) into shape (8)
Interestingly, when I print the shapes of the two items I pass to vstack, they both have shape (8,).
I am trying to determine why this line of the function is failing. Any advice would be greatly appreciated.
import numpy

def csv_to_array(file):
    # Open the file, and load it in delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')
    # Loop through the data in the array
    for index in range(len(data)):
        # Utilize a try catch to try and convert to float, if it can't convert to float, converts to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except Exception:
            data[index] = 0
        except ValueError:
            data[index] = 0
    # Return the now type-formatted data
    return data

def create_folds(dataset):
    length = len(dataset)
    folds = numpy.empty_like(dataset)
    for index in range(5):
        tempArray = numpy.ndarray(shape=(1, length))
        numpy.append(folds, tempArray)
        temp_class_array = numpy.ndarray(shape=(1,1))
        numpy.append(folds, temp_class_array)
    return folds

def class_distribution(dataset):
    dataset = numpy.asarray(dataset)
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    classes = dataset[:,num_columns-1]
    classes = numpy.unique(classes)
    class_weights = []
    for aclass in classes:
        total = 0
        weight = 0
        for row in dataset:
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            else:
                continue
        weight = float((total/num_total_rows))
        class_weights.append(weight)
    class_weights = numpy.asarray(class_weights)
    return classes, class_weights

def cross_validation(dataset):
    classes, class_weights = class_distribution(dataset)
    total_length = len(dataset)
    folds = create_folds(dataset)
    added_so_far = 0
    for a_class, a_class_weight in zip(classes, class_weights):
        amt_for_fold = float(((a_class_weight * total_length) / 5)-1)
        for index in range(0,10,2):
            added = 0
            for jindex in range(len(classes)):
                if added >= amt_for_fold:
                    break
                if classes[jindex] == a_class:
                    print(folds[index].shape)
                    print(dataset[jindex].shape)
                    folds[index] = numpy.vstack((folds[index], dataset[jindex]))
                    # print(folds)
                    folds[index + 1] = numpy.vstack((folds[index + 1], [classes[jindex]]))
                    if index < 8:
                        dataset = numpy.delete(dataset, jindex, 0)
                        classes = numpy.delete(classes, jindex, 0)
                        added_so_far = added_so_far + 1
    for xindex in range(len(folds)):
        folds[xindex] = numpy.delete(folds[xindex], 0, 0)
    print(folds)
    return folds

def main():
    print("BEGINNING CFV")
    ecoli = csv_to_array('Classification/ecoli.csv')
    cross_validation(ecoli)

main()
On the following dataset:
0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
0.52,0.39,0.48,0.5,0.65,0.71,0.73,1
0.29,0.47,0.48,0.5,0.71,0.65,0.69,1
0.55,0.47,0.48,0.5,0.57,0.78,0.8,1
0.12,0.67,0.48,0.5,0.74,0.58,0.63,1
0.4,0.5,0.48,0.5,0.65,0.82,0.84,1
0.73,0.36,0.48,0.5,0.53,0.91,0.92,1
0.84,0.44,0.48,0.5,0.48,0.71,0.74,1
0.48,0.45,0.48,0.5,0.6,0.78,0.8,1
0.54,0.49,0.48,0.5,0.4,0.87,0.88,1
0.48,0.41,0.48,0.5,0.51,0.9,0.88,1
0.5,0.66,0.48,0.5,0.31,0.92,0.92,1
0.72,0.46,0.48,0.5,0.51,0.66,0.7,1
0.47,0.55,0.48,0.5,0.58,0.71,0.75,1
0.33,0.56,0.48,0.5,0.33,0.78,0.8,1
0.64,0.58,0.48,0.5,0.48,0.78,0.73,1
0.11,0.5,0.48,0.5,0.58,0.72,0.68,1
0.31,0.36,0.48,0.5,0.58,0.94,0.94,1
0.68,0.51,0.48,0.5,0.71,0.75,0.78,1
0.69,0.39,0.48,0.5,0.57,0.76,0.79,1
0.52,0.54,0.48,0.5,0.62,0.76,0.79,1
0.46,0.59,0.48,0.5,0.36,0.76,0.23,1
0.36,0.45,0.48,0.5,0.38,0.79,0.17,1
0,0.51,0.48,0.5,0.35,0.67,0.44,1
0.1,0.49,0.48,0.5,0.41,0.67,0.21,1
0.3,0.51,0.48,0.5,0.42,0.61,0.34,1
0.61,0.47,0.48,0.5,0,0.8,0.32,1
0.63,0.75,0.48,0.5,0.64,0.73,0.66,1
0.71,0.52,0.48,0.5,0.64,1,0.99,1
0.72,0.42,0.48,0.5,0.65,0.77,0.79,2
0.79,0.41,0.48,0.5,0.66,0.81,0.83,2
0.83,0.48,0.48,0.5,0.65,0.76,0.79,2
0.69,0.43,0.48,0.5,0.59,0.74,0.77,2
0.79,0.36,0.48,0.5,0.46,0.82,0.7,2
0.78,0.33,0.48,0.5,0.57,0.77,0.79,2
0.75,0.37,0.48,0.5,0.64,0.7,0.74,2
0.59,0.29,0.48,0.5,0.64,0.75,0.77,2
0.67,0.37,0.48,0.5,0.54,0.64,0.68,2
0.66,0.48,0.48,0.5,0.54,0.7,0.74,2
0.64,0.46,0.48,0.5,0.48,0.73,0.76,2
0.76,0.71,0.48,0.5,0.5,0.71,0.75,2
0.84,0.49,0.48,0.5,0.55,0.78,0.74,2
0.77,0.55,0.48,0.5,0.51,0.78,0.74,2
0.81,0.44,0.48,0.5,0.42,0.67,0.68,2
0.58,0.6,0.48,0.5,0.59,0.73,0.76,2
0.63,0.42,0.48,0.5,0.48,0.77,0.8,2
0.62,0.42,0.48,0.5,0.58,0.79,0.81,2
0.86,0.39,0.48,0.5,0.59,0.89,0.9,2
0.81,0.53,0.48,0.5,0.57,0.87,0.88,2
0.87,0.49,0.48,0.5,0.61,0.76,0.79,2
0.47,0.46,0.48,0.5,0.62,0.74,0.77,2
0.76,0.41,0.48,0.5,0.5,0.59,0.62,2
0.7,0.53,0.48,0.5,0.7,0.86,0.87,2
0.64,0.45,0.48,0.5,0.67,0.61,0.66,2
0.81,0.52,0.48,0.5,0.57,0.78,0.8,2
0.73,0.26,0.48,0.5,0.57,0.75,0.78,2
0.49,0.61,1,0.5,0.56,0.71,0.74,2
0.88,0.42,0.48,0.5,0.52,0.73,0.75,2
0.84,0.54,0.48,0.5,0.75,0.92,0.7,2
0.63,0.51,0.48,0.5,0.64,0.72,0.76,2
0.86,0.55,0.48,0.5,0.63,0.81,0.83,2
0.79,0.54,0.48,0.5,0.5,0.66,0.68,2
0.57,0.38,0.48,0.5,0.06,0.49,0.33,2
0.78,0.44,0.48,0.5,0.45,0.73,0.68,2
0.78,0.68,0.48,0.5,0.83,0.4,0.29,3
0.63,0.69,0.48,0.5,0.65,0.41,0.28,3
0.67,0.88,0.48,0.5,0.73,0.5,0.25,3
0.61,0.75,0.48,0.5,0.51,0.33,0.33,3
0.67,0.84,0.48,0.5,0.74,0.54,0.37,3
0.74,0.9,0.48,0.5,0.57,0.53,0.29,3
0.73,0.84,0.48,0.5,0.86,0.58,0.29,3
0.75,0.76,0.48,0.5,0.83,0.57,0.3,3
0.77,0.57,0.48,0.5,0.88,0.53,0.2,3
0.74,0.78,0.48,0.5,0.75,0.54,0.15,3
0.68,0.76,0.48,0.5,0.84,0.45,0.27,3
0.56,0.68,0.48,0.5,0.77,0.36,0.45,3
0.65,0.51,0.48,0.5,0.66,0.54,0.33,3
0.52,0.81,0.48,0.5,0.72,0.38,0.38,3
0.64,0.57,0.48,0.5,0.7,0.33,0.26,3
0.6,0.76,1,0.5,0.77,0.59,0.52,3
0.69,0.59,0.48,0.5,0.77,0.39,0.21,3
0.63,0.49,0.48,0.5,0.79,0.45,0.28,3
0.71,0.71,0.48,0.5,0.68,0.43,0.36,3
0.68,0.63,0.48,0.5,0.73,0.4,0.3,3
0.74,0.49,0.48,0.5,0.42,0.54,0.36,4
0.7,0.61,0.48,0.5,0.56,0.52,0.43,4
0.66,0.86,0.48,0.5,0.34,0.41,0.36,4
0.73,0.78,0.48,0.5,0.58,0.51,0.31,4
0.65,0.57,0.48,0.5,0.47,0.47,0.51,4
0.72,0.86,0.48,0.5,0.17,0.55,0.21,4
0.67,0.7,0.48,0.5,0.46,0.45,0.33,4
0.67,0.81,0.48,0.5,0.54,0.49,0.23,4
0.67,0.61,0.48,0.5,0.51,0.37,0.38,4
0.63,1,0.48,0.5,0.35,0.51,0.49,4
0.57,0.59,0.48,0.5,0.39,0.47,0.33,4
0.71,0.71,0.48,0.5,0.4,0.54,0.39,4
0.66,0.74,0.48,0.5,0.31,0.38,0.43,4
0.67,0.81,0.48,0.5,0.25,0.42,0.25,4
0.64,0.72,0.48,0.5,0.49,0.42,0.19,4
0.68,0.82,0.48,0.5,0.38,0.65,0.56,4
0.32,0.39,0.48,0.5,0.53,0.28,0.38,4
0.7,0.64,0.48,0.5,0.47,0.51,0.47,4
0.63,0.57,0.48,0.5,0.49,0.7,0.2,4
0.69,0.65,0.48,0.5,0.63,0.48,0.41,4
0.43,0.59,0.48,0.5,0.52,0.49,0.56,4
0.74,0.56,0.48,0.5,0.47,0.68,0.3,4
0.71,0.57,0.48,0.5,0.48,0.35,0.32,4
0.61,0.6,0.48,0.5,0.44,0.39,0.38,4
0.59,0.61,0.48,0.5,0.42,0.42,0.37,4
0.74,0.74,0.48,0.5,0.31,0.53,0.52,4
The vstack() is returning a shape (2,8) array.
You're then assigning that (2,8) array to the LHS folds[index], which is just a shape (8,) array.
numpy tries to see whether such a mismatched assignment can be justified by broadcasting, and finally gives up with that error message.
I'm not sure what your actual intent is, so I can't suggest an alternative.
My guess is that folds should actually be created as a 3d array, in which each inner 2d array has as many rows as the length of each fold.
I also suspect that the line folds = numpy.empty_like(dataset) is based on a misunderstanding of numpy.empty_like(). Please double-check that.
I think you might be misunderstanding what vstack does. Given two vectors with 8 items, it will stack them vertically and you will get a 2x8 matrix. Indeed, the output will always be at least 2D. See the docs and examples at https://docs.scipy.org/doc/numpy/reference/generated/numpy.vstack.html
E.g.
a = np.array([1,2,3])
b = np.array([1,2,3])
np.vstack((a,b))
outputs
array([[1, 2, 3],
       [1, 2, 3]])
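To make the growing pattern in cross_validation work at all, folds needs to hold 2-D arrays, for example as a plain Python list. A minimal sketch of that structure (the fold count and row width are taken from the question):
import numpy as np
n_features = 8  # row width of the question's dataset
# one growable 2-D array per fold instead of numpy.empty_like(dataset)
folds = [np.empty((0, n_features)) for _ in range(5)]
row = np.zeros(n_features)             # stand-in for dataset[jindex]
folds[0] = np.vstack((folds[0], row))  # (0,8) + (8,) -> (1,8)
folds[0] = np.vstack((folds[0], row))  # (1,8) + (8,) -> (2,8), no broadcast error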

Python: how to perform a conditional operation on an array

I have a numpy array M of dimension NxM and a dataframe tmp containing information about the cells of the array.
If I have to add values to the cells of M, I do
M[tmp.a, tmp.b] = tmp1.n
However, I would like to assign the values only to those cells where M < tmp.n, something like
M[M[tmp.a, tmp.b] < tmp1.n] = tmp1.n
I solved it this way:
s = np.shape(M)
M0 = np.zeros((s[1], s[0]))
M0[tmp1.a, tmp1.b] += tmp1.n
idx = np.where(M < M0)
M[idx[0], idx[1]] = M0[idx[0], idx[1]]
If I understood you correctly, you can do something like this (np.maximum is the element-wise maximum, unlike Python's built-in max, which fails on arrays):
M[tmp.a, tmp.b] = np.maximum(tmp1.n, M[tmp.a, tmp.b])
This can be done using NumPy's logical (boolean) indexing:
# a logical (boolean) array
log = M < tmp.n
# apply it to source and target and use `+=` to add the values
M[log] += tmp.n[log]
If the arrays don't have the same shape then you can also pick a specific dimension:
log = M[:, 0] < tmp.n
# apply it to source and target and use `+=` to add the values
M[log, 0] += tmp.n[log]
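Both approaches assume each (a, b) pair appears only once; with repeated coordinates, np.maximum.at applies the element-wise maximum unbuffered, once per pair. A minimal sketch with made-up coordinates standing in for tmp:
import numpy as np
M = np.zeros((3, 3))
a = np.array([0, 0, 2])     # stand-in for tmp.a (note the repeated cell)
b = np.array([1, 1, 2])     # stand-in for tmp.b
n = np.array([5., 7., 4.])  # stand-in for tmp.n
# keeps the running maximum per cell, so M[0, 1] ends up 7, not 5
np.maximum.at(M, (a, b), n)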

filling numpy array by index

I have a function which gives me the index for a given value, e.g.,
def F(value):
    index = do_something(value)
    return index
I want to use this index to fill a huge numpy array with 1s. Let's call the array features:
l = [1,4,2,3,7,5,3,6,.....]
NOTE: features.shape[0] = len(l)
for i in range(features.shape[0]):
    idx = F(l[i])
    features[i, idx] = 1
Is there a pythonic way to perform this (as the loop takes a lot of time if the array is huge)?
If you can vectorize F(value) you could write something like
indices = np.arange(features.shape[0])
feature_indices = F(l)
features[indices, feature_indices] = 1
try this:
i = np.arange(features.shape[0]) # rows
j = np.vectorize(F)(np.array(l)) # columns
features[i,j] = 1
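As a self-contained illustration (the mapping inside F is made up here, since do_something isn't shown), the whole fill reduces to one fancy-indexed assignment once F accepts an array:
import numpy as np

def F(values):
    # hypothetical stand-in for do_something: map each value to a column
    return np.asarray(values) % 8

l = [1, 4, 2, 3, 7, 5, 3, 6]
features = np.zeros((len(l), 8))
# one vectorized assignment replaces the Python loop
features[np.arange(len(l)), F(l)] = 1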

Can a python list hold a multi-dimensional array as its element?

I am trying to do image processing using Python.
I want to create a list which holds numpy.ndarrays.
My code looks like this:
def Minimum_Close(Shade_Corrected_Image, Size):
    uint32_Shade_Corrected_Image = pymorph.to_int32(Shade_Corrected_Image)
    Angles = []
    [Row, Column] = Shade_Corrected_Image.shape
    Angles = [i*15 for i in range(12)]
    Image_Close = [0 for x in range(len(Angles))]
    Image_Closing = numpy.zeros((Row, Column))
    for s in range(len(Angles)):
        Struct_Element = pymorph.seline(Size, Angles[s])
        Image_Closing = pymorph.close(uint32_Shade_Corrected_Image, Struct_Element)
        Image_Close[s] = Image_Closing
    Min_Close_Image = numpy.zeros(Shade_Corrected_Image.shape)
    temp_array = [][]
    Temp_Cell = numpy.zeros((Row, Column))
    for r in range(1, Row):
        for c in range(1, Column):
            for Cell in Image_Close:
                Temp_Cell = Image_Close[Cell]
                temp_array[Cell] = Temp_Cell[r][c]
            Min_Close_Image[r][c] = min(temp_array)
    Min_Close_Image = Min_Close_Image - Shade_Corrected_Image
    return Min_Close_Image
While running this code I'm getting an error:
Temp_Cell = Image_Close[Cell]
TypeError: only integer arrays with one element can be converted to an index
How can I make a data structure which holds different multi-dimensional arrays and then traverse it?
Making a list of arrays is not necessary when you're using numpy.
I suggest rewriting the whole function like this (note that np.dstack needs a sequence, so a list comprehension rather than a bare generator):
def Minimum_Close(shade_corrected_image, size):
    uint32_shade_corrected_image = pymorph.to_int32(shade_corrected_image)
    angles = np.arange(12) * 15
    def pymorph_op(angle):
        struct_element = pymorph.seline(size, angle)
        return pymorph.close(uint32_shade_corrected_image, struct_element)
    image_close = np.dstack([pymorph_op(a) for a in angles])
    min_close_image = np.min(image_close, axis=-1) - shade_corrected_image
    return min_close_image
I lowercased the variable names so they stop getting highlighted as classes.
What about:
for cnt, Cell in enumerate(Image_Close):
    Temp_Cell = Image_Close[cnt]
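To answer the title question directly: yes, a plain Python list can hold ndarrays, and the per-pixel minimum over such a list needs no nested loops. A minimal sketch with random stand-in images:
import numpy as np
# a plain Python list holding twelve 2-D arrays (stand-ins for the closings)
image_close = [np.random.rand(4, 4) for _ in range(12)]
# stack the list into a 3-D array and take the element-wise minimum,
# which replaces the triple loop over rows, columns, and list entries
min_close_image = np.min(np.stack(image_close), axis=0)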
