I'm reading in data and trying to create a NumPy array of shape (194, 1). So it should look like: [[4], [0], [9], ...]
I'm doing this:
import numpy as np

def parse_data(file_name):
    data = []
    target = []
    with open(file_name) as f:
        for line in f:
            temp = line.split()
            x = [float(x) for x in temp[:2]]
            y = float(temp[2])
            data.append(np.array(x))
            target.append(np.array(y))
    return np.array(data), np.array(target)
x, y = parse_data("data.txt")
When I inspect y.shape, it's (194,), not (194, 1) as I expected.
x, however, has shape (194, 2), as I'd expect.
Any idea what I'm doing incorrectly?
Thanks!
You seem to have expected np.array(y) to automatically turn your scalar y into a 1-element row. That's not how NumPy works.
np.array(y) is 0-dimensional. Putting a bunch of those in a list and calling array on the list produces a 1-dimensional result, not a 2-dimensional one.
When np.array() is called on a list of 0-dimensional arrays, it stacks them into a single 1-dimensional array, giving you your (194,) shape.
You can always reshape y to your desired shape:
def parse_data(file_name):
    data = []
    target = []
    with open(file_name) as f:
        for line in f:
            temp = line.split()
            x = [float(x) for x in temp[:2]]
            y = float(temp[2])
            data.append(np.array(x))
            target.append(y)
    return np.array(data), np.array(target).reshape(-1, 1)
x, y = parse_data("data.txt")
Of course you can also fix your problem with:
target.append(np.array([y]))
An example of the behavior I stated above:
>>> import numpy as np
>>> a = np.array(5)
>>> b = np.array(4)
>>> v = [a, b]
>>> v
[array(5), array(4)]
>>> np.array(v)
array([5, 4])  # the 0-d arrays are stacked into a 1-d array
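And, applying the per-element fix from above to the same toy values (a minimal sketch), wrapping each scalar in a 1-element array gives the 2-d shape:

>>> np.array([np.array([5]), np.array([4])])
array([[5],
       [4]])
>>> np.array([np.array([5]), np.array([4])]).shape
(2, 1)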
I'd skip the np.array in the iteration.
def parse_data(file_name):
    data = []
    target = []
    with open(file_name) as f:
        for line in f:
            temp = line.split()
            x = [float(x) for x in temp[:2]]
            y = float(temp[2])
            data.append(x)
            target.append(y)
    return np.array(data), np.array(target)
This would create data like:
[[1.0, 2.0],[3.0, 4.0], ....]
and target like
[1.2, 3.2, 3.1, ...]
np.array(data) then turns the list of lists into a 2d array, and the list of numbers into a 1d array.
It is then easy to reshape or add a dimension to the 1d array, making it (1, n) or (n, 1) or whatever you need.
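For example (a minimal sketch with a hypothetical 3-element target):

y = np.array([1.2, 3.2, 3.1])  # shape (3,)
y.reshape(-1, 1)               # shape (3, 1)
y[np.newaxis, :]               # shape (1, 3)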
Remember the basic array construction methods are:
np.array([1,2,3]) # 1d
np.array([[1,2],[3,4]]) # 2d
I have a very large array, but I'll use a smaller one to explain.
Given source array X
X = [ [1,1,1,1],
[2,2,2,2],
[3,3,3,3]]
A target array with the same size Y
Y = [ [-1,-1,-1,-1],
[-2,-2,-2,-2],
[-3,-3,-3,-3]]
And an assignment array IDX:
IDX = [ [1,0,0,0],
[0,0,1,0],
[0,1,0,1]]
I want to assign Y to X via IDX: only assign where IDX == 1.
In this case, something like:
X[IDX] = Y[IDX]
will result in:
X = [ [-1,1,1,1],
[2,2,-2,2],
[3,-3,3,-3]]
How can this be done efficiently (not a for-loop) in numpy/pandas?
Thx
If IDX is a NumPy array of Boolean type, and X and Y are NumPy arrays, then your intuition works:
X = np.array(X)
Y = np.array(Y)
IDX = np.array(IDX).astype(bool)
X[IDX] = Y[IDX]
This changes X in place.
If you don't want to do all this type casting, or don't want to overwrite X, then np.where() does what you want in one go:
np.where(IDX==1, Y, X)
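A quick check with the arrays from the question (a minimal sketch; the question's lists are wrapped in np.array first, and Y is taken as -X for brevity):

import numpy as np

X = np.array([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]])
Y = -X
IDX = np.array([[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 1]])

# np.where returns a new array; X itself is left unchanged.
print(np.where(IDX == 1, Y, X))
# [[-1  1  1  1]
#  [ 2  2 -2  2]
#  [ 3 -3  3 -3]]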
I am trying to get good at NumPy and want to know if I can use values in existing arrays as indices for a function that returns values for another array. I can do this:
import numpy as np

def somefun(i):
    return i + 1

x = np.array([2, 4, 5])
k_labs = np.arange(100)
k_labs2 = k_labs[somefun(x[:])]
But how do I deal with vectors inside matrices, in case x were a 2-D array where I just want to use one vector at a time as the index argument for a function, such as x[:, i], without using for-loops?
such as would be the case in:
x = np.array([[2, 4, 5], [7, 8, 9]])

def somefun(i):
    return i + 1

k_labs = np.arange(100)
k_labs2 = k_labs[somefun(x[:, i])]  # i is undefined here; this is the open question
EDIT ITERATION 2
To get the gist of what I am trying to accomplish, see the code below. In the function pred, as you can see, I wanted to write the things I've commented out in a NumPy fashion that might work better. I have some problems, though, with the two lines I put in instead, since I get an error about wrong broadcast dimensions in the function called distance, at the line where I try to assign the normalized vectors to a variable.
class kNN:
    def __init__(self, X_train: np.array, label_train, val=None):
        self.X = X_train            # X[:-1, :]
        self.labels = label_train   # X[-1, :]
        # self.k = k
        self.kNN_4all = None        # np.zeros(self.X.shape[1])

    def distance(self, x1):
        # Creates a matrix of len(X) copies of the x1 vector for easy matrix subtraction.
        x1 = np.tile(x1, (self.X.shape[1], 1))
        # Transposes so linalg.norm is taken across each row.
        dists = np.linalg.norm(x1 - self.X.T, axis=1)
        return dists

    def k_nearest(self, x_vec, k):
        k_nearest = self.distance(x_vec)
        k_nearest = np.argsort(k_nearest)[:k]
        kNN_labs = np.zeros(k_nearest.shape)
        kNN_labs[:] = self.labels[k_nearest[:]]
        unique, vote = np.unique(kNN_labs, return_counts=True)
        return unique[np.argmax(vote)]

    def pred(self, X_test, k):
        self.kNN_4all = np.zeros(X_test.shape[1])
        self.kNN_4all = self.k_nearest(X_test[:, :], k)
        # for i in range(X_test.shape[1]):
        #     NewLabel = self.k_nearest(X_test[:, i], k)  # defines x_vec in matrix X
        #     self.kNN_4all[i] = NewLabel
        # return self.kNN_4all

    def prec(self, labels_val):
        elem_equal = (self.kNN_4all == labels_val).astype(int).flatten()
        prec = np.sum(elem_equal) / elem_equal.shape
        return 1 - prec[0]

X_train = X[:, :100]
labs_train = labs[:100]
pilot = kNN(X_train, labs_train)
pilot.pred(X[:, 100:200], 10)
pilot.prec(labs[100:200])
I get the following error:
ValueError: operands could not be broadcast together with shapes (78400,100) (100,784)
As we can see from the code, k_nearest(self, x_vec, k) takes one 1-D subarray, so passing a full matrix X causes the broadcasting error, since the functions within k_nearest rely on receiving only a 1-D subarray.
I don't know if it is really possible to avoid for-loops here and have NumPy step through 1-D subarrays as arguments to a function, such that each call's result is assigned to a different cell of another array, in this case self.kNN_4all.
x = np.array([[2, 4, 5], [7, 8, 9], [33, 50, 71]])
x = x + 1
k_labs = np.arange(100)
ttt = k_labs[x]
print(ttt)
ttt is an array that takes values from k_labs, using the values in x as indices. The array is accessed, for example:
print(ttt[1])#[ 8 9 10]
If you want to refer to a certain set of values alone (for example, with the indices x[2]), then the code is as follows:
x = np.array([[2, 4, 5], [7, 8, 9], [33, 50, 71]])
x = x + 1
k_labs = np.arange(100)
print(k_labs[x[2]])
I have a problem converting a Python matrix of torch.Tensor objects to a single torch.Tensor.
For example, M is an (n, m) matrix where each element M[i][j] is a torch.Tensor of the same size (p, q, r, ...). How do I convert the Python list of lists M to a torch.Tensor of size (n, m, p, q, r, ...)?
e.g.
M = []
for i in range(5):
    row = []
    for j in range(10):
        row.append(torch.rand(3, 4))
    M.append(row)
How do I convert the above M to a torch.Tensor of size (5, 10, 3, 4)?
Try torch.stack() to stack a list of tensors on the first dimension.
import torch

M = []
for i in range(5):
    row = []
    for j in range(10):
        row.append(torch.rand(3, 4))
    row = torch.stack(row)
    M.append(row)
M = torch.stack(M)
print(M.size())
# torch.Size([5, 10, 3, 4])
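Equivalently, if M is already built as the plain list of lists of tensors from the question, both stacking steps can be collapsed into one nested comprehension (a minimal sketch):

M = torch.stack([torch.stack(row) for row in M])
print(M.size())
# torch.Size([5, 10, 3, 4])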
Try this.
import numpy as np
import torch

ref = np.arange(3*4*5).reshape(3, 4, 5)      # NumPy array
values = [ref.copy() + i for i in range(6)]  # list of NumPy arrays
b = torch.from_numpy(np.array(values))       # torch tensor from a list of NumPy arrays
# b.size() == torch.Size([6, 3, 4, 5])
References
Converting NumPy Array to Torch Tensor
The general solution to this question is being worked on in this github issue, but I was wondering if there are workarounds using tf.gather (or something else) to achieve array indexing using a multi-index. One solution I came up with was to broadcast multiply each index in the multi-idx with the cumulative product of the tensor shape, which produces indices suitable for indexing the flattened tensor:
import tensorflow as tf
import numpy as np
def __cumprod(l):
    # Get the length and make a copy
    ll = len(l)
    l = [v for v in l]
    # Reverse cumulative product
    for i in range(ll - 1):
        l[ll - i - 2] *= l[ll - i - 1]
    return l

def ravel_multi_index(tensor, multi_idx):
    """
    Returns a tensor suitable for use as the index
    on a gather operation on argument tensor.
    """
    if not isinstance(tensor, (tf.Variable, tf.Tensor)):
        raise TypeError('tensor should be a tf.Variable or tf.Tensor')

    if not isinstance(multi_idx, list):
        multi_idx = [multi_idx]

    # Shape of the tensor in ints
    shape = [i.value for i in tensor.get_shape()]

    if len(shape) != len(multi_idx):
        raise ValueError("Tensor rank is different "
                         "from the multi_idx length.")

    # Work out the shape of each tensor in the multi_idx
    idx_shape = [tuple(j.value for j in i.get_shape()) for i in multi_idx]

    # Ensure that each multi_idx tensor is length 1
    assert all(len(i) == 1 for i in idx_shape)

    # Create a list of reshaped indices. New shape will be
    # [1, 1, dim[0], 1] for the 3rd index in multi_idx
    # for example.
    reshaped_idx = [tf.reshape(idx, [1 if i != j else dim[0]
                                     for j in range(len(shape))])
                    for i, (idx, dim)
                    in enumerate(zip(multi_idx, idx_shape))]

    # Figure out the base indices for each dimension
    base = __cumprod(shape)

    # Now multiply base indices by each reshaped index
    # to produce the flat index
    return (sum(b * s for b, s in zip(base[1:], reshaped_idx[:-1]))
            + reshaped_idx[-1])
# Shape and slice starts and sizes
shape = (Z, Y, X) = 4, 5, 6
Z0, Y0, X0 = 1, 1, 1
ZS, YS, XS = 3, 3, 4

# Numpy matrix and index
M = np.random.random(size=shape)
idx = [
    np.arange(Z0, Z0 + ZS).reshape(ZS, 1, 1),
    np.arange(Y0, Y0 + YS).reshape(1, YS, 1),
    np.arange(X0, X0 + XS).reshape(1, 1, XS),
]

# Tensorflow matrix and indices
TM = tf.Variable(M)
TF_flat_idx = ravel_multi_index(TM, [
    tf.range(Z0, Z0 + ZS),
    tf.range(Y0, Y0 + YS),
    tf.range(X0, X0 + XS)])
TF_data = tf.gather(tf.reshape(TM, [-1]), TF_flat_idx)

with tf.Session() as S:
    S.run(tf.initialize_all_variables())

    # Obtain data via flat indexing
    data = S.run(TF_data)

    # Check that it agrees with data obtained
    # by numpy smart indexing
    assert np.all(data == M[idx])
However, this only works on tensors of rank 3, due to a (current) limitation that restricts broadcasts to tensors of rank 3.
At the moment I can only think of doing a chained gather, transpose, gather, transpose, gather, but this is unlikely to be efficient. e.g.
shape = (8, 9, 10)
A = tf.random_normal(shape)
data = tf.gather(tf.transpose(tf.gather(A, [1, 3]), [1,0,2]), ...)
Any ideas?
It sounds like you want tf.gather_nd.
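For instance, a minimal sketch under the question's setup (same TF 1.x Session style; building the multi-index grid with NumPy here is my own assumption, not part of the original answer):

import numpy as np
import tensorflow as tf

M = np.random.random(size=(4, 5, 6))
TM = tf.constant(M)

# Broadcast the three per-axis index vectors against each other and
# stack them into a (3, 3, 4, 3) tensor of (z, y, x) index triples.
z = np.arange(1, 4).reshape(3, 1, 1)
y = np.arange(1, 4).reshape(1, 3, 1)
x = np.arange(1, 5).reshape(1, 1, 4)
nd_idx = np.stack(np.broadcast_arrays(z, y, x), axis=-1)

TF_data = tf.gather_nd(TM, nd_idx)

with tf.Session() as S:
    data = S.run(TF_data)
    # Agrees with NumPy fancy indexing on the same grids.
    assert np.all(data == M[z, y, x])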
In my Python program I concatenate several integers and an array. It would be intuitive if this worked:
x, y, z = 1, 2, np.array([3, 3, 3])
np.concatenate((x, y, z))
# ValueError: zero-dimensional arrays cannot be concatenated
However, instead all ints have to be converted to np.arrays:
x, y, z = 1, 2, np.array([3, 3, 3])
np.concatenate((np.array([x]), np.array([y]), z))
Especially if you have many variables, this manual converting is tedious. The problem is that x and y are scalars, which become 0-dimensional arrays, while z is 1-dimensional. Is there any way to do the concatenation without the converting?
They just have to be sequence objects, not necessarily numpy arrays:
x, y, z = 1, 2, np.array([3, 3, 3])
np.concatenate(([x], [y], z))
# array([1, 2, 3, 3, 3])
Numpy also does have an insert function that will do this:
x, y, z = 1, 2, np.array([3, 3, 3])
np.insert(z, [0, 0], [x, y])
# array([1, 2, 3, 3, 3])
I'll add that if you're just trying to add integers to a list, you don't need NumPy to do it:
x, y, z = 1, 2, [3, 3, 3]
z = [x] + [y] + z
or
x, y, z = 1, 2, [3, 3, 3]
[x, y] + z
or
x, y, z = 1, 2, [3, 3, 3]
z.insert(0, y)
z.insert(0, x)
# z is now [1, 2, 3, 3, 3]