I am attempting to run the following code, but I am getting this error:
line 71, in cross_validation
    folds[index] = numpy.vstack((folds[index], dataset[jindex]))
ValueError: could not broadcast input array from shape (2,8) into shape (8)
What is interesting is that when I print the shapes of the two items I am passing to vstack, they both have the same shape, (8,).
I am trying to determine why this line of the function is failing. Any advice would be greatly appreciated.
import numpy

def csv_to_array(file):
    # Open the file, and load it in delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')

    # Loop through the data in the array
    for index in range(len(data)):
        # Utilize a try catch to try and convert to float, if it can't convert to float, converts to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except Exception:
            data[index] = 0
        except ValueError:
            data[index] = 0

    # Return the now type-formatted data
    return data

def create_folds(dataset):
    length = len(dataset)
    folds = numpy.empty_like(dataset)
    for index in range(5):
        tempArray = numpy.ndarray(shape=(1, length))
        numpy.append(folds, tempArray)
        temp_class_array = numpy.ndarray(shape=(1, 1))
        numpy.append(folds, temp_class_array)
    return folds

def class_distribution(dataset):
    dataset = numpy.asarray(dataset)
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    classes = dataset[:, num_columns - 1]
    classes = numpy.unique(classes)
    class_weights = []
    for aclass in classes:
        total = 0
        weight = 0
        for row in dataset:
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            else:
                continue
        weight = float((total / num_total_rows))
        class_weights.append(weight)
    class_weights = numpy.asarray(class_weights)
    return classes, class_weights

def cross_validation(dataset):
    classes, class_weights = class_distribution(dataset)
    total_length = len(dataset)
    folds = create_folds(dataset)
    added_so_far = 0
    for a_class, a_class_weight in zip(classes, class_weights):
        amt_for_fold = float(((a_class_weight * total_length) / 5) - 1)
        for index in range(0, 10, 2):
            added = 0
            for jindex in range(len(classes)):
                if added >= amt_for_fold:
                    break
                if classes[jindex] == a_class:
                    print(folds[index].shape)
                    print(dataset[jindex].shape)
                    folds[index] = numpy.vstack((folds[index], dataset[jindex]))
                    # print(folds)
                    folds[index + 1] = numpy.vstack((folds[index + 1], [classes[jindex]]))
                    if index < 8:
                        dataset = numpy.delete(dataset, jindex, 0)
                        classes = numpy.delete(classes, jindex, 0)
                    added_so_far = added_so_far + 1
    for xindex in range(len(folds)):
        folds[xindex] = numpy.delete(folds[xindex], 0, 0)
    print(folds)
    return folds

def main():
    print("BEGINNING CFV")
    ecoli = csv_to_array('Classification/ecoli.csv')
    cross_validation(ecoli)

main()
On the following dataset:
0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
0.52,0.39,0.48,0.5,0.65,0.71,0.73,1
0.29,0.47,0.48,0.5,0.71,0.65,0.69,1
0.55,0.47,0.48,0.5,0.57,0.78,0.8,1
0.12,0.67,0.48,0.5,0.74,0.58,0.63,1
0.4,0.5,0.48,0.5,0.65,0.82,0.84,1
0.73,0.36,0.48,0.5,0.53,0.91,0.92,1
0.84,0.44,0.48,0.5,0.48,0.71,0.74,1
0.48,0.45,0.48,0.5,0.6,0.78,0.8,1
0.54,0.49,0.48,0.5,0.4,0.87,0.88,1
0.48,0.41,0.48,0.5,0.51,0.9,0.88,1
0.5,0.66,0.48,0.5,0.31,0.92,0.92,1
0.72,0.46,0.48,0.5,0.51,0.66,0.7,1
0.47,0.55,0.48,0.5,0.58,0.71,0.75,1
0.33,0.56,0.48,0.5,0.33,0.78,0.8,1
0.64,0.58,0.48,0.5,0.48,0.78,0.73,1
0.11,0.5,0.48,0.5,0.58,0.72,0.68,1
0.31,0.36,0.48,0.5,0.58,0.94,0.94,1
0.68,0.51,0.48,0.5,0.71,0.75,0.78,1
0.69,0.39,0.48,0.5,0.57,0.76,0.79,1
0.52,0.54,0.48,0.5,0.62,0.76,0.79,1
0.46,0.59,0.48,0.5,0.36,0.76,0.23,1
0.36,0.45,0.48,0.5,0.38,0.79,0.17,1
0,0.51,0.48,0.5,0.35,0.67,0.44,1
0.1,0.49,0.48,0.5,0.41,0.67,0.21,1
0.3,0.51,0.48,0.5,0.42,0.61,0.34,1
0.61,0.47,0.48,0.5,0,0.8,0.32,1
0.63,0.75,0.48,0.5,0.64,0.73,0.66,1
0.71,0.52,0.48,0.5,0.64,1,0.99,1
0.72,0.42,0.48,0.5,0.65,0.77,0.79,2
0.79,0.41,0.48,0.5,0.66,0.81,0.83,2
0.83,0.48,0.48,0.5,0.65,0.76,0.79,2
0.69,0.43,0.48,0.5,0.59,0.74,0.77,2
0.79,0.36,0.48,0.5,0.46,0.82,0.7,2
0.78,0.33,0.48,0.5,0.57,0.77,0.79,2
0.75,0.37,0.48,0.5,0.64,0.7,0.74,2
0.59,0.29,0.48,0.5,0.64,0.75,0.77,2
0.67,0.37,0.48,0.5,0.54,0.64,0.68,2
0.66,0.48,0.48,0.5,0.54,0.7,0.74,2
0.64,0.46,0.48,0.5,0.48,0.73,0.76,2
0.76,0.71,0.48,0.5,0.5,0.71,0.75,2
0.84,0.49,0.48,0.5,0.55,0.78,0.74,2
0.77,0.55,0.48,0.5,0.51,0.78,0.74,2
0.81,0.44,0.48,0.5,0.42,0.67,0.68,2
0.58,0.6,0.48,0.5,0.59,0.73,0.76,2
0.63,0.42,0.48,0.5,0.48,0.77,0.8,2
0.62,0.42,0.48,0.5,0.58,0.79,0.81,2
0.86,0.39,0.48,0.5,0.59,0.89,0.9,2
0.81,0.53,0.48,0.5,0.57,0.87,0.88,2
0.87,0.49,0.48,0.5,0.61,0.76,0.79,2
0.47,0.46,0.48,0.5,0.62,0.74,0.77,2
0.76,0.41,0.48,0.5,0.5,0.59,0.62,2
0.7,0.53,0.48,0.5,0.7,0.86,0.87,2
0.64,0.45,0.48,0.5,0.67,0.61,0.66,2
0.81,0.52,0.48,0.5,0.57,0.78,0.8,2
0.73,0.26,0.48,0.5,0.57,0.75,0.78,2
0.49,0.61,1,0.5,0.56,0.71,0.74,2
0.88,0.42,0.48,0.5,0.52,0.73,0.75,2
0.84,0.54,0.48,0.5,0.75,0.92,0.7,2
0.63,0.51,0.48,0.5,0.64,0.72,0.76,2
0.86,0.55,0.48,0.5,0.63,0.81,0.83,2
0.79,0.54,0.48,0.5,0.5,0.66,0.68,2
0.57,0.38,0.48,0.5,0.06,0.49,0.33,2
0.78,0.44,0.48,0.5,0.45,0.73,0.68,2
0.78,0.68,0.48,0.5,0.83,0.4,0.29,3
0.63,0.69,0.48,0.5,0.65,0.41,0.28,3
0.67,0.88,0.48,0.5,0.73,0.5,0.25,3
0.61,0.75,0.48,0.5,0.51,0.33,0.33,3
0.67,0.84,0.48,0.5,0.74,0.54,0.37,3
0.74,0.9,0.48,0.5,0.57,0.53,0.29,3
0.73,0.84,0.48,0.5,0.86,0.58,0.29,3
0.75,0.76,0.48,0.5,0.83,0.57,0.3,3
0.77,0.57,0.48,0.5,0.88,0.53,0.2,3
0.74,0.78,0.48,0.5,0.75,0.54,0.15,3
0.68,0.76,0.48,0.5,0.84,0.45,0.27,3
0.56,0.68,0.48,0.5,0.77,0.36,0.45,3
0.65,0.51,0.48,0.5,0.66,0.54,0.33,3
0.52,0.81,0.48,0.5,0.72,0.38,0.38,3
0.64,0.57,0.48,0.5,0.7,0.33,0.26,3
0.6,0.76,1,0.5,0.77,0.59,0.52,3
0.69,0.59,0.48,0.5,0.77,0.39,0.21,3
0.63,0.49,0.48,0.5,0.79,0.45,0.28,3
0.71,0.71,0.48,0.5,0.68,0.43,0.36,3
0.68,0.63,0.48,0.5,0.73,0.4,0.3,3
0.74,0.49,0.48,0.5,0.42,0.54,0.36,4
0.7,0.61,0.48,0.5,0.56,0.52,0.43,4
0.66,0.86,0.48,0.5,0.34,0.41,0.36,4
0.73,0.78,0.48,0.5,0.58,0.51,0.31,4
0.65,0.57,0.48,0.5,0.47,0.47,0.51,4
0.72,0.86,0.48,0.5,0.17,0.55,0.21,4
0.67,0.7,0.48,0.5,0.46,0.45,0.33,4
0.67,0.81,0.48,0.5,0.54,0.49,0.23,4
0.67,0.61,0.48,0.5,0.51,0.37,0.38,4
0.63,1,0.48,0.5,0.35,0.51,0.49,4
0.57,0.59,0.48,0.5,0.39,0.47,0.33,4
0.71,0.71,0.48,0.5,0.4,0.54,0.39,4
0.66,0.74,0.48,0.5,0.31,0.38,0.43,4
0.67,0.81,0.48,0.5,0.25,0.42,0.25,4
0.64,0.72,0.48,0.5,0.49,0.42,0.19,4
0.68,0.82,0.48,0.5,0.38,0.65,0.56,4
0.32,0.39,0.48,0.5,0.53,0.28,0.38,4
0.7,0.64,0.48,0.5,0.47,0.51,0.47,4
0.63,0.57,0.48,0.5,0.49,0.7,0.2,4
0.69,0.65,0.48,0.5,0.63,0.48,0.41,4
0.43,0.59,0.48,0.5,0.52,0.49,0.56,4
0.74,0.56,0.48,0.5,0.47,0.68,0.3,4
0.71,0.57,0.48,0.5,0.48,0.35,0.32,4
0.61,0.6,0.48,0.5,0.44,0.39,0.38,4
0.59,0.61,0.48,0.5,0.42,0.42,0.37,4
0.74,0.74,0.48,0.5,0.31,0.53,0.52,4
vstack() is returning an array of shape (2, 8).
You're then assigning that (2, 8) array to the left-hand side folds[index], which is just a shape (8,) array.
NumPy checks whether such a mismatched assignment can be justified by broadcasting, and since the broadcasting rules don't allow it, it gives up with exactly that error message.
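You can reproduce the same message in isolation with a couple of throwaway arrays (these are illustrative, not your data):

import numpy

a = numpy.zeros((3, 8))                 # a[0] is a single row of shape (8,)
row = numpy.ones(8)                     # also shape (8,)
stacked = numpy.vstack((a[0], row))     # stacking two (8,) rows gives shape (2, 8)
print(stacked.shape)                    # (2, 8)
a[0] = stacked                          # ValueError: could not broadcast input array from shape (2,8) into shape (8)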
I'm not sure what your actual intent is, so I can't suggest an alternative.
My guess is that folds should actually be created as a 3D array, in which each inner 2D array has as many rows as the length of each fold.
I also suspect that the line folds = numpy.empty_like(dataset) is based on a misunderstanding of numpy.empty_like(): it just gives you another uninitialized array with the same shape as dataset, not a container of empty folds. Please double-check that.
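As a minimal sketch of what I mean by a 3D folds array (assuming, purely for illustration, equal-sized folds rather than your stratified ones):

import numpy

dataset = numpy.loadtxt('Classification/ecoli.csv', delimiter=',')

# empty_like just mirrors the input: same 2D shape as dataset, uninitialized contents.
print(numpy.empty_like(dataset).shape)

n_folds = 5
fold_size = len(dataset) // n_folds               # assumes the rows divide evenly, for illustration only
folds = numpy.empty((n_folds, fold_size, dataset.shape[1]))

# Each folds[i] is now a 2D block, and each folds[i, j] is a single (8,) row,
# so assigning one dataset row into one slot works without a broadcasting error.
folds[0, 0] = dataset[0]
print(folds[0].shape, folds[0, 0].shape)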
I think you might be misunderstanding what vstack does. Given two vectors with 8 items, it will stack them vertically and you will get a 2x8 matrix. Indeed, the output will always be at least 2D. See the documentation and examples at https://docs.scipy.org/doc/numpy/reference/generated/numpy.vstack.html
E.g.

import numpy as np

a = np.array([1, 2, 3])
b = np.array([1, 2, 3])
np.vstack((a, b))

outputs

array([[1, 2, 3],
       [1, 2, 3]])
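So each time your loop runs, vstack hands back a 2D array that no longer fits into the single row folds[index]. If the goal is simply to accumulate rows, one common pattern (sketched here with made-up rows, not your stratification logic) is to collect them in a Python list and stack once at the end:

import numpy as np

rows = []
for value in range(3):
    rows.append(np.full(8, float(value)))   # each appended row keeps shape (8,)

result = np.vstack(rows)
print(result.shape)                         # (3, 8)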