Can someone help to fix this error: I am a beginner and finding it difficult to figure out how to fix it.
This is the error I am getting :
ValueError: Expected 2D array, got 1D array instead:
array=[ 282 561 837 ... 649442 649701 649957].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
class MyDataset(Dataset):
    """Dataset of ECG beat windows labelled as "on a QRS" (1) or not (0).

    Every patient contributes ``nb_qrs`` samples; sample ``idx`` maps to a
    ``(patient_idx, rpeak_idx)`` pair stored in ``idx_tuples``.
    """

    def __init__(self, patient_ids, bih2aami=True, directory="", fs=360):
        """
        Parameters
        ----------
        patient_ids : list of str
            Record identifiers; each one is appended to ``directory`` to build
            the path handed to ``get_signal``.
        bih2aami : bool
            Stored on the instance; not used by the code in this class.
        directory : str
            Path prefix of the records. Previously this attribute was read but
            never assigned, which raised AttributeError in ``__getitem__``.
        fs : int
            Sampling frequency used to size the random window jitter and passed
            on to ``extract_beat`` (presumably in Hz -- confirm against data).
        """
        self.patient_ids = patient_ids  # list of patient IDs
        self.directory = directory
        self.fs = fs
        self.nb_qrs = 99  # number of beats taken per patient
        # One (patient, beat) pair per sample, grouped by patient.
        self.idx_tuples = [
            (patient_idx, rpeak_idx)
            for patient_idx in range(len(patient_ids))
            for rpeak_idx in range(self.nb_qrs)
        ]
        self.bih2aami = bih2aami

    def __len__(self):
        """Return the number of samples (patients * nb_qrs)."""
        return len(self.idx_tuples)

    def __getitem__(self, idx):
        """Return ``(X, y)``: the beat window and its 0/1 label as float tensors.

        Even indices place the window exactly on an annotated QRS position;
        odd indices shift it by a random offset of at most a quarter of ``fs``
        samples in either direction.
        """
        patient_idx, rpeak_idx = self.idx_tuples[idx]
        patient_id = self.patient_ids[patient_idx]
        path = self.directory + patient_id
        signal, normal_qrs_pos = get_signal(path)
        # NOTE(review): assumes every record has at least nb_qrs annotated
        # beats; a shorter record raises IndexError here.
        qrs_pos = normal_qrs_pos[rpeak_idx]
        if idx % 2 != 0:
            jitter = round(.25 * self.fs)
            qrs_pos = qrs_pos + randint(-jitter, jitter)
        beat, label = extract_beat(signal, qrs_pos, normal_qrs_pos, fs=self.fs)
        if label == 1:
            print("==== FOUND ONE MATCHING QRS === pos = ", qrs_pos)
        else:
            print("==== NO MATCH === pos = ", qrs_pos)
        X, y = torch.tensor(beat).float(), torch.tensor(label).float()
        print(y.size())
        return X, y
The code for beat extraction
def extract_beat(signal, win_pos, qrs_positions, win_msec=40, fs=360, start_beat=36, end_beat=108):
    """
    Cut one beat window out of ``signal`` and label it against the annotations.

    signal          sequence of samples (converted to a numpy array)
    win_pos         position at which you place the window of your beat
    qrs_positions   the qrs indices from the annotations (read them from the
                    atr file) --> obtained from annotation.sample; may be a
                    list or an array of any shape
    win_msec        matching tolerance in milliseconds
    fs              sampling frequency used to convert win_msec to samples
    start_beat      samples kept before win_pos
    end_beat        samples kept after win_pos

    Returns ``(beat, label)`` where ``beat`` is
    ``signal[start:start + start_beat + end_beat + 1]`` (shorter if the window
    runs past the end of the signal) and ``label`` is 1 when the nearest
    annotated QRS lies within the tolerance of ``win_pos``, else 0.

    Raises ValueError when ``qrs_positions`` is empty.
    """
    # extract signal
    signal = np.array(signal)
    start = int(max(win_pos - start_beat, 0))
    stop = start + start_beat + end_beat + 1
    beat = signal[start:stop]

    # Nearest annotated QRS to win_pos. The positions are one-dimensional, so
    # the nearest-neighbour distance is simply the smallest absolute
    # difference; this also avoids the "Expected 2D array" error raised when a
    # 1D array is handed to NearestNeighbors.fit.
    qrs = np.asarray(qrs_positions, dtype=float).ravel()
    if qrs.size == 0:
        raise ValueError("qrs_positions must contain at least one position")
    tolerance = (fs * win_msec) // 1000  # samples at a distance <= tolerance are matched
    distance = float(np.min(np.abs(qrs - win_pos)))

    # label
    label = 1 if distance <= tolerance else 0
    # Same debug line layout as before (distance shown as a 1-element array).
    print(np.array([distance]), tolerance, label)
    return beat, label
As the sklearn docs say (https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors.fit),
you should pass a 2D array (of shape (n_samples, n_features)) to the fit method.
As the error message itself suggests, you can simply reshape the array:
# compute the nearest neighbor of win_pos among qrs_positions
tolerance = (fs*win_msec)//1000  # samples at a distance <= tolerance are matched
# fit() expects shape (n_samples, n_features): one column, one row per QRS position.
# np.asarray lets this work for plain lists as well as arrays.
nbr = NearestNeighbors(n_neighbors=1).fit(np.asarray(qrs_positions).reshape(-1,1))
distances, indices = nbr.kneighbors(np.array([[win_pos]]).reshape(-1,1))
Related
I have two sets of satellite data. For both sets, I have the pixel geometry (latitude and longitude of each corner of the pixel). I would like to regrid one set to the other. Thus, my goal is area-weighted regridding from an irregular grid to another irregular grid. I am aware of xESMF, but am unsure if that is the best tool for the job. Perhaps iris area weighting regrid would be appropriate?
I've run into similar things in the past. I'm on Windows, and xESMF wasn't really an option for me.
I've written this package, and added some methods for computing grid to grid weights:
https://github.com/Deltares/numba_celltree
(You can pip install it.)
The data structure can deal with fully unstructured 2D meshes, and expects the data in such a format. See the code below.
You will need to make some changes: your coordinates aren't named x and y most likely. You will also need to update the ugrid2d_topology function somewhat, since I'm assuming regular quadrilateral grids here (but they're irregular when seen in each others coordinate system).
It's still pretty straightforward, just make sure you have a 2D array of vertices, and a face_node_connectivity array of shape (n_cell, 4) which maps every face to its four vertices. See this documentation for a little more background:
https://ugrid-conventions.github.io/ugrid-conventions/
from typing import Union

import numpy as np
import pandas as pd
import pyproj
import xarray as xr
from numba_celltree import CellTree2d
# Annotation-only aliases: both are plain ``np.ndarray``; the names only signal
# the intended element kind (floating-point vs. integer) to the reader.
FloatArray = np.ndarray
IntArray = np.ndarray
def _coord(da, dim):
"""
Transform N xarray midpoints into N + 1 vertex edges
"""
delta_dim = "d" + dim # e.g. dx, dy, dz, etc.
# If empty array, return empty
if da[dim].size == 0:
return np.array(())
if delta_dim in da.coords: # equidistant or non-equidistant
dx = da[delta_dim].values
if dx.shape == () or dx.shape == (1,): # scalar -> equidistant
dxs = np.full(da[dim].size, dx)
else: # array -> non-equidistant
dxs = dx
_check_monotonic(dxs, dim)
else: # undefined -> equidistant
if da[dim].size == 1:
raise ValueError(
f"DataArray has size 1 along {dim}, so cellsize must be provided"
" as a coordinate."
)
dxs = np.diff(da[dim].values)
dx = dxs[0]
atolx = abs(1.0e-4 * dx)
if not np.allclose(dxs, dx, atolx):
raise ValueError(
f"DataArray has to be equidistant along {dim}, or cellsizes"
" must be provided as a coordinate."
)
dxs = np.full(da[dim].size, dx)
dxs = np.abs(dxs)
x = da[dim].values
if not da.indexes[dim].is_monotonic_increasing:
x = x[::-1]
dxs = dxs[::-1]
# This assumes the coordinate to be monotonic increasing
x0 = x[0] - 0.5 * dxs[0]
x = np.full(dxs.size + 1, x0)
x[1:] += np.cumsum(dxs)
return x
def _ugrid2d_dataset(
    node_x: FloatArray,
    node_y: FloatArray,
    face_x: FloatArray,
    face_y: FloatArray,
    face_nodes: IntArray,
) -> xr.Dataset:
    """
    Assemble an xarray Dataset describing a 2D mesh with UGRID-style attributes.

    The dataset holds a dummy ``mesh2d`` variable carrying the topology
    attributes, ``node_x``/``node_y`` coordinates on the ``node`` dimension,
    and a ``face_nodes`` variable on (``face``, ``nmax_face``) with
    ``face_x``/``face_y`` as coordinates of the ``face`` dimension.
    """
    topology_attrs = {
        "cf_role": "mesh_topology",
        "long_name": "Topology data of 2D mesh",
        "topology_dimension": 2,
        "node_coordinates": "node_x node_y",
        "face_node_connectivity": "face_nodes",
        "edge_node_connectivity": "edge_nodes",
    }
    connectivity_attrs = {
        "cf_role": "face_node_connectivity",
        "long_name": "Vertex nodes of mesh faces (counterclockwise)",
        "start_index": 0,
        "_FillValue": -1,
    }

    mesh = xr.Dataset()
    mesh["mesh2d"] = xr.DataArray(data=0, attrs=topology_attrs)
    mesh = mesh.assign_coords(
        node_x=xr.DataArray(data=node_x, dims=["node"]),
        node_y=xr.DataArray(data=node_y, dims=["node"]),
    )
    mesh["face_nodes"] = xr.DataArray(
        data=face_nodes,
        coords={"face_x": ("face", face_x), "face_y": ("face", face_y)},
        dims=["face", "nmax_face"],
        attrs=connectivity_attrs,
    )
    mesh.attrs = {"Conventions": "CF-1.8 UGRID-1.0"}
    return mesh
def ugrid2d_topology(data: Union[xr.DataArray, xr.Dataset]) -> xr.Dataset:
    """
    Derive the 2D-UGRID quadrilateral mesh topology from a structured DataArray
    or Dataset, with (2D-dimensions) "y" and "x".

    Parameters
    ----------
    data: Union[xr.DataArray, xr.Dataset]
        Structured data from which the "x" and "y" coordinate will be used to
        define the UGRID-2D topology.

    Returns
    -------
    ugrid_topology: xr.Dataset
        Dataset with the required arrays describing 2D unstructured topology:
        node_x, node_y, face_x, face_y, face_nodes (connectivity).
    """
    # Transform midpoints into vertices
    # These are always returned monotonically increasing, so flip them back
    # when the data itself runs in decreasing order.
    x = data["x"].values
    xcoord = _coord(data, "x")
    if not data.indexes["x"].is_monotonic_increasing:
        xcoord = xcoord[::-1]
    y = data["y"].values
    ycoord = _coord(data, "y")
    if not data.indexes["y"].is_monotonic_increasing:
        ycoord = ycoord[::-1]

    # Compute all vertices, these are the ugrid nodes
    node_y, node_x = (a.ravel() for a in np.meshgrid(ycoord, xcoord, indexing="ij"))
    face_y, face_x = (a.ravel() for a in np.meshgrid(y, x, indexing="ij"))
    linear_index = np.arange(node_x.size, dtype=np.int32).reshape(
        ycoord.size, xcoord.size
    )

    # Allocate face_node_connectivity. These are node indices, so the array
    # must be integer typed (np.empty defaults to float64, which does not
    # match the integer _FillValue and cannot be used for indexing).
    nfaces = (ycoord.size - 1) * (xcoord.size - 1)
    face_nodes = np.empty((nfaces, 4), dtype=linear_index.dtype)

    # Set connectivity in counterclockwise manner
    face_nodes[:, 0] = linear_index[:-1, 1:].ravel()  # upper right
    face_nodes[:, 1] = linear_index[:-1, :-1].ravel()  # upper left
    face_nodes[:, 2] = linear_index[1:, :-1].ravel()  # lower left
    face_nodes[:, 3] = linear_index[1:, 1:].ravel()  # lower right

    # Tie it together
    ds = _ugrid2d_dataset(node_x, node_y, face_x, face_y, face_nodes)
    return ds
def area_weighted_mean(
da: xr.DataArray,
destination_index: np.ndarray,
source_index: np.ndarray,
weights: np.ndarray,
):
"""
Area weighted mean.
Parameters
----------
da: xr.DataArray
Contains source data.
destination_index: np.ndarray
In which destination the overlap is located.
source_index: np.ndarray
In which source cell the overlap is located.
weights: np.ndarray
Area of each overlap.
Returns
-------
destination_index: np.ndarray
values: np.ndarray
"""
values = da.data.ravel()[source_index]
df = pd.DataFrame(
{"dst": destination_index, "area": weights, "av": weights * values}
)
aggregated = df.groupby("dst").sum("sum", min_count=1)
out = aggregated["av"] / aggregated["area"]
return out.index.values, out.values
class Regridder:
    """
    Regridder to reproject and/or regrid rasters. When no ``crs_source`` and
    ``crs_destination`` are provided, it is assumed that ``source`` and
    ``destination`` share the same coordinate system.

    Note that an area weighted regridding method only makes sense for projected
    (Cartesian!) coordinate systems.

    Parameters
    ----------
    source: xr.DataArray
        Source example. Must have dimensions ("y", "x").
    destination: xr.DataArray
        Destination example. Must have dimensions ("y", "x").
    crs_source: optional, default: None
    crs_destination: optional, default: None

    Raises
    ------
    ValueError
        If exactly one of ``crs_source`` / ``crs_destination`` is given.
    """

    def __init__(
        self,
        source: xr.DataArray,
        destination: xr.DataArray,
        crs_source=None,
        crs_destination=None,
    ):
        src = ugrid2d_topology(source)
        dst = ugrid2d_topology(destination)
        src_yy = src["node_y"].values
        src_xx = src["node_x"].values
        if crs_source and crs_destination:
            # Bring the source vertices into the destination coordinate system.
            transformer = pyproj.Transformer.from_crs(
                crs_from=crs_source, crs_to=crs_destination, always_xy=True
            )
            src_xx, src_yy = transformer.transform(xx=src_xx, yy=src_yy)
        elif bool(crs_source) != bool(crs_destination):
            # Plain truthiness comparison: ``^`` is undefined for None and str
            # and raised TypeError instead of this ValueError.
            raise ValueError("Received only one of (crs_source, crs_destination)")

        src_vertices = np.column_stack([src_xx, src_yy])
        src_faces = src["face_nodes"].values.astype(int)
        dst_vertices = np.column_stack((dst["node_x"].values, dst["node_y"].values))
        # Cast like src_faces so both connectivity arrays are integer typed.
        dst_faces = dst["face_nodes"].values.astype(int)
        celltree = CellTree2d(src_vertices, src_faces, fill_value=-1)

        self.source = source.copy()
        self.destination = destination.copy()
        (
            self.destination_index,
            self.source_index,
            self.weights,
        ) = celltree.intersect_faces(
            dst_vertices,
            dst_faces,
            fill_value=-1,
        )

    def regrid(self, da: xr.DataArray, fill_value=np.nan):
        """
        Parameters
        ----------
        da: xr.DataArray
            Data to regrid.
        fill_value: optional, default: np.nan
            Default value of the output grid, e.g. where no overlap occurs.

        Returns
        -------
        regridded: xr.DataArray
            Data of da, regridded using an area weighted mean.

        Raises
        ------
        ValueError
            If the "x"/"y" coordinates of ``da`` differ from those of the
            source the regridder was built with.
        """
        src = self.source
        if not (np.allclose(da["y"], src["y"]) and np.allclose(da["x"], src["x"])):
            raise ValueError("da does not match source")
        index, values = area_weighted_mean(
            da,
            self.destination_index,
            self.source_index,
            self.weights,
        )
        # Pick a dtype that holds both the fill value and the regridded values;
        # otherwise an integer fill_value would create an integer array and
        # silently truncate the means.
        dtype = np.result_type(values.dtype, np.asarray(fill_value).dtype)
        data = np.full(self.destination.shape, fill_value, dtype=dtype)
        # np.full returns a contiguous array, so ravel() is a view and the
        # assignment writes into ``data``.
        data.ravel()[index] = values
        out = self.destination.copy(data=data)
        out.name = da.name
        return out
# Example use: regrid a raster from EPSG:4326 onto the grid of "example.nc"
# (given in EPSG:3035) and write the result to disk.
da = xr.open_dataarray("gw_abstraction_sum.nc")
like = xr.open_dataarray("example.nc")
# Building the regridder computes the cell overlaps once; it can then be
# reused for any DataArray that shares the x/y coordinates of ``da``.
regridder = Regridder(
    source=da, destination=like, crs_source=4326, crs_destination=3035
)
result = regridder.regrid(da)
# NOTE(review): regrid() computes an area weighted *mean*, although the output
# file name says "sum" -- confirm which one is intended.
result.to_netcdf("area-weighted_sum.nc")
I'm struggling in creating a data generator in PyTorch to extract 2D images from many 3D cubes saved in .dat format
There is a total of 200 3D cubes each having a 128*128*128 shape. Now I want to extract 2D images from all of these cubes along length and breadth.
For example, a is a cube having size 128*128*128
So I want to extract all 2D images along length i.e., [:, i, :] which will get me 128 2D images along the length, and similarly i want to extract along width i.e., [:, :, i], which will give me 128 2D images along the width. So therefore i get a total of 256 2D images from 1 3D cube, and i want to repeat this whole process for all 200 cubes, there by giving me 51200 2D images.
So far I've tried a very basic implementation which works fine but takes approximately 10 minutes to run. I would like help creating a more efficient implementation, keeping time and space complexity in mind. My current approach has a time complexity of O(n^2); can it be reduced further?
I'm providing below the current implementation
from os.path import join as pjoin
import torch
import numpy as np
import os
from tqdm import tqdm
from torch.utils import data
class DataGenerator(data.Dataset):
    """Serves 2D sections cut from paired seismic / fault volumes.

    Every file found in ``train/seis`` is read together with the file of the
    same name in ``train/fault``. For each pair, all sections along the last
    axis are stored first, followed by all sections along the middle axis.
    """

    def __init__(self, is_transform=True, augmentations=None):
        self.is_transform = is_transform
        self.augmentations = augmentations
        self.dim = (128, 128, 128)

        inputs = []  # Input
        targets = []  # Ground Truth
        seismic_dir = pjoin('train', 'seis')
        fault_dir = pjoin('train', 'fault')
        file_names = os.listdir(seismic_dir)

        for file_name in tqdm(file_names, total=len(file_names)):
            # The .dat file holds the unrolled cube: reshape it, then transpose
            # so the height axis comes first (length = axis 1, width = axis 2).
            seismic = np.transpose(
                np.fromfile(pjoin(seismic_dir, file_name), dtype=np.single).reshape(self.dim)
            )
            fault = np.transpose(
                np.fromfile(pjoin(fault_dir, file_name), dtype=np.single).reshape(self.dim)
            )

            # Sections along the last axis ...
            n_last = seismic.shape[2]
            inputs.extend(seismic[:, :, k] for k in range(n_last))
            targets.extend(fault[:, :, k] for k in range(n_last))

            # ... then sections along the middle axis.
            n_mid = seismic.shape[1]
            inputs.extend(seismic[:, k, :] for k in range(n_mid))
            targets.extend(fault[:, k, :] for k in range(n_mid))

        self.seismicSections = inputs
        self.faultSections = targets

    def __len__(self):
        """Number of stored sections."""
        return len(self.seismicSections)

    def __getitem__(self, index):
        """Return the (seismic, fault) section pair at ``index``."""
        return self.seismicSections[index], self.faultSections[index]
Please Help!!!
Why not store only the 3D volumes in memory, and let the __getitem__ method "slice" them on the fly?
class CachedVolumeDataset(Dataset):
    """Keeps whole 3D volumes in memory and slices 2D sections on demand.

    For a volume of shape (d0, d1, d2) the first d2 sections are taken along
    the last axis (``[:, :, i]``) and the next d1 along the middle axis
    (``[:, i, :]``). All volumes are assumed to share the shape of the first.
    """

    def __init__(self, volumes_x, volumes_y):
        """
        Parameters
        ----------
        volumes_x : sequence of 3D arrays or tensors
            Input volumes.
        volumes_y : sequence of 3D arrays or tensors
            Ground-truth volumes, one per input volume.

        Raises
        ------
        ValueError
            If the two sequences differ in length.
        """
        super().__init__()
        if len(volumes_x) != len(volumes_y):
            raise ValueError("volumes_x and volumes_y must have the same length")
        # as_tensor accepts numpy arrays as well as tensors, so torch.squeeze
        # in __getitem__ always receives a tensor.
        self._volumes_x = [torch.as_tensor(v) for v in volumes_x]
        self._volumes_y = [torch.as_tensor(v) for v in volumes_y]
        if self._volumes_x:
            _, self._n_mid, self._n_last = self._volumes_x[0].shape
        else:
            self._n_mid = self._n_last = 0

    def __len__(self):
        """Total number of 2D sections over all volumes."""
        return len(self._volumes_x) * (self._n_last + self._n_mid)

    def __getitem__(self, index):
        """Return the (input, target) section pair at the flat ``index``."""
        if not 0 <= index < len(self):
            raise IndexError(index)
        # extract volume index and slice index from the general index
        vidx, sidx = divmod(index, self._n_last + self._n_mid)
        if sidx < self._n_last:
            # sections along the last axis
            x = self._volumes_x[vidx][:, :, sidx]
            y = self._volumes_y[vidx][:, :, sidx]
        else:
            sidx -= self._n_last
            # sections along the middle axis
            x = self._volumes_x[vidx][:, sidx, :]
            y = self._volumes_y[vidx][:, sidx, :]
        return torch.squeeze(x), torch.squeeze(y)
I'm struggling to reshape my images. Each image has dimensions (100,100,3). The total array for all images has shape (3267, 100, 3).
def get_batch(batch_size, s="train"):
    """Create batch of n pairs, half same class, half different class.

    Parameters
    ----------
    batch_size : int
        Number of pairs. Classes are drawn without replacement, so it must not
        exceed the number of classes.
    s : str
        ``'train'`` selects the module-level ``Xtrain``; anything else selects
        ``Xval``.

    Returns
    -------
    pairs : list of two np.ndarray
        Each of shape (batch_size, w, h, channels).
    targets : np.ndarray
        Shape (batch_size,); 0 for the first half, 1 for the second half.

    Raises
    ------
    ValueError
        If the image array is not laid out as
        (n_classes, n_examples, w, h, channels).
    """
    X = np.asarray(Xtrain if s == 'train' else Xval)
    # The data must keep its class and example axes. Flattening it with
    # reshape(-1, w, h, channels) yields a 4D array, which cannot be unpacked
    # into the five sizes needed below.
    if X.ndim != 5:
        raise ValueError(
            "expected images shaped (n_classes, n_examples, w, h, channels), "
            "got an array of shape %s" % (X.shape,)
        )
    n_classes, n_examples, w, h, chan = X.shape

    # randomly sample several classes to use in the batch
    categories = rng.choice(n_classes, size=(batch_size,), replace=False)
    # initialize 2 empty arrays for the input image batch; they carry the same
    # number of channels as the images so the assignments below fit
    pairs = [np.zeros((batch_size, w, h, chan)) for _ in range(2)]
    # initialize vector for the targets
    targets = np.zeros((batch_size,))
    # make one half of it '1's, so 2nd half of batch has same class
    targets[batch_size // 2:] = 1

    for i in range(batch_size):
        category = categories[i]
        idx_1 = rng.randint(0, n_examples)
        pairs[0][i] = X[category, idx_1]
        idx_2 = rng.randint(0, n_examples)
        # pick images of same class for 2nd half, different for 1st
        if i >= batch_size // 2:
            category_2 = category
        else:
            # add a random number to the category modulo n classes to ensure
            # 2nd image has a different category
            category_2 = (category + rng.randint(1, n_classes)) % n_classes
        pairs[1][i] = X[category_2, idx_2]
    return pairs, targets
However when trying to reshape the array pairs[0][i,:,:,:] = X[category, idx_1].reshape(w, h, chan) I always obtain the error that an array size of 300 is not reshapable into (100,100,3). I honestly don't see the problem why it should be...
Can anybody help me out?
You are trying to reshape an array of 300 values into (100,100,3). That cannot work, because 100*100*3 = 30000, which is not equal to 300: a reshape is only possible when the output shape holds exactly as many values as the input.
I suggest using (10,10,3) instead, because 10*10*3 = 300.
Given an Abaqus odb-file including a node set (e.g. 'ALL_SECS').
NODAL-quantities like coordinates ('COORD') or displacement ('U') can be extracted at the nodes of the node set by the following pattern:
select step, frame and fieldoutput (e.g. 'COORD', 'U')
getSubset(region=) of the fieldoutput
get attributes of the resulting values
How can INTEGRATION_POINT-quantities be extracted / interpolated at nodes of the node set?
How can fieldoutput at NODAL-position be requested using abaqus-python?
from odbAccess import *
import numpy as np
# Helper function
def values_to_array(values, dim=2, item='data'):
    """Collect attribute ``item`` of every entry of ``values`` into a float array.

    Returns an array of shape (len(values), dim); row i holds
    ``getattr(values[i], item)``.
    """
    rows = np.zeros((len(values), dim), dtype='float64')
    for position, value in enumerate(values):
        rows[position, :] = getattr(value, item)
    return rows
# Prepare and open
odb = openOdb(path='job.odb') # Solution of 2D-plane-stress model
instances = odb.rootAssembly.instances
instance = instances['PART']
sett = instance.nodeSets['ALL_SECS']
# Last step key; indexing keys() directly relies on it returning a list.
step = odb.steps.keys()[-1]
# Get coordinates and number of nodes in node set (last frame of the step)
frame = odb.steps[step].frames[-1]
values_xy = frame.fieldOutputs['COORD'].getSubset(region=sett).values
xy = values_to_array(values=values_xy, dim=2, item='dataDouble')
nbr_xy = len(values_xy)
print('len(values_xy)')
print(len(values_xy))
# Get nodal-quantity and number of nodes in node set
# (the zero array is replaced by the values_to_array result just below)
uvw = np.zeros((nbr_xy, 2), dtype=float)
outp = odb.steps[step].frames[-1].fieldOutputs['U']
values_u = outp.getSubset(region=sett).values
uvw = values_to_array(values=values_u, dim=2, item='dataDouble')
print('len(values_u)')
print(len(values_u))
# Strain field 'E': request it at ELEMENT_NODAL position for the node set.
# eps is allocated here but not filled in this snippet.
eps = np.zeros((nbr_xy, 4), dtype=float)
outp = odb.steps[step].frames[-1].fieldOutputs['E']
values_eps = outp.getSubset(position=ELEMENT_NODAL, region=sett).values
# values_eps = outp.getSubset(position=ELEMENT_NODAL).getSubset(region=sett).values
print('len(values_eps)')
print(len(values_eps))
# Same field requested at NODAL position; the printed length shows how many
# values that request returns for the node set.
values_eps_nodal = outp.getSubset(position=NODAL, region=sett).values
print('len(values_eps_nodal)')
print(len(values_eps_nodal))
Output:
len(values_xy)
147
len(values_u)
147
len(values_eps)
408
len(values_eps_nodal)
0
The following solution is a workaround to get total strain (Fieldoutput 'E') at nodes, specified in the node set 'ALL_SECS'. As the order of the extracted nodes is not known, location information, i.e. coordinates of the nodes, is extracted as well.
The i-th strain in eps is the strain at the i-th coordinate in xy.
This feature seems not to exist in the Abaqus API.
Node-specific data, like displacements, can easily be extracted, see uv.
Key steps to extract strain data at element nodes and location:
Identify coordinates
Identify mapping nodeLabel -> index
Combine values at nodes, extrapolated from different elements using moving average. (See link for explanations)
Note: 2D model odb
from odbAccess import *
import numpy as np
import pickle
from operator import attrgetter
def values_to_array(values, dim=2, item='data', dtype=np.float64):
    '''Collect attribute ``item`` of every entry of ``values`` into an array.

    Thanks to https://stackoverflow.com/a/46925902/8935243

    values : iterable of objects exposing the attribute ``item``
    dim    : unused; kept so existing calls keep working
    item   : name of the attribute to read from each entry
    dtype  : dtype of the returned array

    Returns an array with one row (or one scalar) per entry of ``values``.
    '''
    getter = attrgetter(item)
    # Build a real list first: on Python 3 ``map`` returns a lazy iterator,
    # which np.array cannot convert into a numeric array.
    return np.array([getter(value) for value in values], dtype=dtype)
def values_to_index_mapping(values, item='nodeLabel', check=True):
    '''Map the ``item`` attribute (node label) of each entry to its position.

    With ``check`` enabled, asserts that no label occurs twice.
    Returns a dict ``{label: index}``.
    '''
    labels = values_to_array(values, dim=1, item=item, dtype=np.int64)
    if check:
        assert len(set(labels)) == len(labels)
    return {label: position for position, label in enumerate(labels)}
# Open the output database and select the node set of interest.
odb = openOdb(path='job.odb')
instances = odb.rootAssembly.instances
instance = instances['PART']
sett = instance.nodeSets['ALL_SECS']
# Last step key; indexing keys() directly relies on it returning a list.
step = odb.steps.keys()[-1]
# Coordinates (last frame of the step)
frame = odb.steps[step].frames[-1]
values = frame.fieldOutputs['COORD'].getSubset(region=sett).values
xy = values_to_array(values=values, dim=2, item='data')
# Dimensions
nbr_xy = len(values)
# Mapping: nodeLabel -> index (row position in xy)
index_map = values_to_index_mapping(values=values, check=True)
# Displacements
uv = np.zeros((nbr_xy, 2), dtype=float)
outp = odb.steps[step].frames[-1].fieldOutputs['U']
values = outp.getSubset(region=sett).values
uv[:, :] = values_to_array(values=values, dim=2, item='data')
# Strains: eps accumulates the running mean per node, tmp counts how many
# element contributions each node has received so far.
eps = np.zeros((nbr_xy, 4), dtype=float)
tmp = np.zeros((nbr_xy, 1), dtype=float)
values_eps = odb.steps[step].frames[-1].fieldOutputs['E'].getSubset(
    position=ELEMENT_NODAL,
    region=sett,
).values
# Moving average, as ELEMENT_NODAL does no averaging
# and returns multiple values for nodes in sett
for ee in values_eps:
    index = index_map[ee.nodeLabel]
    tmp[index] += 1
    # Incremental mean: new_mean = (old_mean * (n - 1) + value) / n
    eps[index] = (eps[index] * (tmp[index] - 1) + ee.data) / tmp[index]
odb.close()
I am new to face recognition. I am trying to do face recognition with the help of the bytefish facerec framework. It works fine, but the results are not very accurate. Therefore, I want to apply a threshold. As suggested on its page (https://github.com/bytefish/facerec), I should be able to do that. However, the explanation on the page isn't very clear. So here's what I am doing.
My classifier
def predict(self, q):
    """Classify the feature vector ``q`` by majority vote of its k nearest samples.

    Reads ``self.X`` (training samples), ``self.y`` (their labels),
    ``self.k`` and ``self.dist_metric``.

    Returns ``[predicted_label, {'labels': ..., 'distances': ...}]`` where the
    dictionary holds the labels and distances of the k nearest samples in
    ascending order of distance.

    Raises Exception if more distances than labels were collected.
    """
    distances = []
    for xi in self.X:
        xi = xi.reshape(-1,1)
        d = self.dist_metric(xi, q)
        distances.append(d)
    if len(distances) > len(self.y):
        raise Exception("More distances than classes. Is your distance metric correct?")
    distances = np.asarray(distances)
    # Get the indices in an ascending sort order:
    idx = np.argsort(distances)
    # Sort the labels and distances accordingly:
    sorted_y = self.y[idx]
    sorted_distances = distances[idx]
    # Take only the k first items:
    sorted_y = sorted_y[0:self.k]
    sorted_distances = sorted_distances[0:self.k]
    # Make a histogram of them:
    hist = dict((key,val) for key, val in enumerate(np.bincount(sorted_y)) if val)
    # And get the bin with the maximum frequency. items() exists on both
    # Python 2 and Python 3, unlike iteritems().
    predicted_label = max(hist.items(), key=op.itemgetter(1))[0]
    # A classifier should output a list with the label as first item and
    # generic data behind. The k-nearest neighbor classifier outputs the
    # distance of the k first items. So imagine you have a 1-NN and you
    # want to perform a threshold against it, you should take the first
    # item
    return [predicted_label, { 'labels' : sorted_y, 'distances' : sorted_distances }]
My model
def predict(self, X):
    """Extract features from ``X`` and return the classifier's prediction for them."""
    return self.classifier.predict(self.feature.extract(X))
My server.py which generates the output
def get_prediction(image_data, threshold=10.0):
    """Predict the label for ``image_data`` and reject matches that are too distant.

    The image is passed through ``preprocess_image`` and ``model.predict``.
    The prediction is expected to be ``[label, {'distances': [...], ...}]``;
    the first distance is compared against ``threshold``.

    Returns the string "nonsense" when that distance exceeds ``threshold``;
    otherwise prints and returns the full prediction.
    """
    image = preprocess_image(image_data)
    prediction = model.predict(image)
    classifier_output = prediction[1]
    distance = classifier_output['distances'][0]
    if distance > threshold:
        return "nonsense"
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(prediction)
    return prediction
So the problem is I am not able to get the distance here. Please help
After a while I was able to solve the problem. The threshold should be applied in the classifier file, not in server.py.
Solution
# Body of the classifier's predict(self, q) with a distance threshold added
# at the end (the enclosing ``def`` line is not part of this fragment).
distances = []
for xi in self.X:
    xi = xi.reshape(-1,1)
    d = self.dist_metric(xi, q)
    distances.append(d)
if len(distances) > len(self.y):
    raise Exception("More distances than classes. Is your distance metric correct?")
distances = np.asarray(distances)
# Get the indices in an ascending sort order:
idx = np.argsort(distances)
# Sort the labels and distances accordingly:
sorted_y = self.y[idx]
sorted_distances = distances[idx]
# Take only the k first items:
sorted_y = sorted_y[0:self.k]
sorted_distances = sorted_distances[0:self.k]
#sorted_distances = 1134.04873217
# Make a histogram of them:
hist = dict((key,val) for key, val in enumerate(np.bincount(sorted_y)) if val)
# And get the bin with the maximum frequency:
predicted_label = max(hist.iteritems(), key=op.itemgetter(1))[0]
# A classifier should output a list with the label as first item and
# generic data behind. The k-nearest neighbor classifier outputs the
#global unknown
# NOTE(review): sorted_distances is an array slice of length k; comparing it
# to a scalar inside ``if`` is only unambiguous when it holds one element.
# NOTE(review): both branches return the same value, so as written the
# threshold does not change the result -- confirm what should be returned
# for distances above 1800.
if sorted_distances > 1800 :
    return [predicted_label]
else:
    return [predicted_label]