efficient way to convert a nested list to numpy array - python

I have a nested list with different list sizes and types.
def read(f, tree, objects):
    Event = []
    for o in objects:
        # find the different features of one class
        temp = [i.GetName() for i in tree.GetListOfBranches() if i.GetName().startswith(o)]
        tempList = []  # contains one class of objects
        for t in temp:
            tempList.append(t)
            comp = np.asarray(getattr(tree, t))
            tempList.append(comp)
        Event.append(tempList)
    return Event
def main():
    path = "path/to/file"
    objects = ['TauJet', 'Jet', 'Electron', 'Muon', 'Photon', 'Tracks', 'ETmis', 'CaloTower']
    f = ROOT.TFile(path)
    tree = f.Get("RecoTree")
    tree.GetEntry(100)
    event = read(f, tree, objects)
For example, the result of event[0] is:
['TauJet', array(1), 'TauJet_E', array([ 31.24074173]), 'TauJet_Px', array([-28.27997971]), 'TauJet_Py', array([-13.18042469]), 'TauJet_Pz', array([-1.08304048]), 'TauJet_Eta', array([-0.03470514]), 'TauJet_Phi', array([-2.70545626]), 'TauJet_PT', array([ 31.20065498]), 'TauJet_Charge', array([ 1.]), 'TauJet_NTracks', array([3]), 'TauJet_EHoverEE', array([ 1745.89221191]), 'TauJet_size', array(1)]
How can I convert it into a numpy array?
NOTE 1: np.asarray(event, "object") is slow. I am looking for a better way. Also, np.fromiter() is not applicable, since I don't have a fixed type.
NOTE 2: I don't know the length of my Events.
NOTE 3: I can also get rid of the names if it makes things easier.

You could try something like this; I'm not sure how fast it's going to be, though. This creates a numpy record array for the first row.
data = event[0]
keys = data[0::2]
vals = data[1::2]
# there are some zero-rank arrays in there, so we need to check for those,
# but I think just recasting them to float should work
temp = [float(v) for v in vals]
# you could also just create a np array from the line above with np.array(temp)
# note: "formats" must be a list of format strings, one per field
dtype = {"names": keys, "formats": ["f4"] * len(vals)}
myArr = np.rec.fromarrays(temp, dtype=dtype)
# test it out
In [53]: myArr["TauJet_Pz"]
Out[53]: array(-1.0830404758453369, dtype=float32)
# alternatively, you could try something like this, which just creates a 2d numpy array
vals = np.array([[float(v) for v in row[1::2]] for row in event])
# now create a nice record array from that using the dtype above
# (fromarrays expects one array per field, hence the transpose)
myRecordArray = np.rec.fromarrays(vals.T, dtype=dtype)
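Since the question mentions lists of different sizes, note that casting every value to a scalar float will drop data for fields that hold more than one value (e.g. per-track arrays). A minimal alternative sketch, not a full solution, is to keep each event as a plain dict of numpy arrays, which tolerates fields of different lengths:

# a hedged sketch: one dict per event, reusing `data` from above
event_dict = {k: np.atleast_1d(v) for k, v in zip(data[0::2], data[1::2])}
event_dict["TauJet_Pz"]  # array([-1.08304048])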

Related

Random Macro-nutrient selection (Python)

I am currently attempting to build a program that randomly selects food items from a table (which has a macro-nutrient breakdown).
What I would like to know is: how do I tell Python "print the index of the food you randomly selected as a list"?
Assume our input looks like:
import numpy as np
macro_nutrients = [
    'carbohydrates',
    'fats',
    'dietary_fiber',
    'minerals',
    'proteins',
    'vitamins',
    'water'
]
You have several options:
If your macro-nutrients are stored in a list-like structure, you can do:
el = np.random.choice(macro_nutrients)
idx = macro_nutrients.index(el)
print(el, "; Is the index correct?:", el == macro_nutrients[idx])

# or you can just draw the index directly
# (note: randint's upper bound is exclusive, so use len(...), not len(...) - 1,
# otherwise the last element could never be selected)
idx = np.random.randint(0, len(macro_nutrients))
print(macro_nutrients[idx])
For [].index() you can check this SO answer for caveats.
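The main caveat is worth a quick illustration (a minimal sketch of my own, not from the linked answer): .index() returns the first match, so with duplicate entries the index you get back may not be the position of the element you actually drew. Drawing the index first avoids the ambiguity entirely.

items = ['fats', 'water', 'fats']
el = items[2]           # suppose the random draw picked the second 'fats'
print(items.index(el))  # prints 0 -- the first occurrence, not position 2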
If you have a table-like structure (e.g. numpy 2d array):
# we will simulate it by permuting the above list several times and adding each
# permutation as a row in the new 2d array
# (np.random.permutation already returns a copy, so no .copy() is needed):
mat = np.array([np.random.permutation(macro_nutrients),
                np.random.permutation(macro_nutrients),
                np.random.permutation(macro_nutrients),
                np.random.permutation(macro_nutrients)])

# flatten() will convert your table back to a 1d array
np.random.choice(mat.flatten())

# otherwise, you can use something like this
# (again, randint's upper bound is exclusive):
row = np.random.randint(0, mat.shape[0])
col = np.random.randint(0, mat.shape[1])
print(mat[row, col])

Sort numpy string array using positional data

I have a numpy array of strings
names = array([
    'p00x00', 'p01x00', 'p02x00', 'p03x00', 'p04x00', 'p05x00',
    'p00x01', 'p01x01', 'p02x01', 'p03x01', 'p04x01', 'p05x01',
    'p00x02', 'p01x02', 'p02x02', 'p03x02', 'p04x02', 'p05x02',
    'p00x03', 'p01x03', 'p02x03', 'p03x03', 'p04x03', 'p05x03',
    'p00x04', 'p01x04', 'p02x04', 'p03x04', 'p04x04', 'p05x04',
    'p00x05', 'p01x05', 'p02x05', 'p03x05', 'p04x05', 'p05x05'])
And corresponding position data
X = array([2.102235, 2.094113, 2.086038, 2.077963, 2.069849, 2.061699])
Y = array([-7.788431, -7.780364, -7.772306, -7.764247, -7.756188, -7.748114])
How can I sort names using X and Y such that I get out a sorted grid of names with shape (6, 6)? Note that there are essentially 6 unique X and Y positions -- I'm not just arbitrarily choosing 6x6.
names = array([
    ['p00x00', 'p01x00', 'p02x00', 'p03x00', 'p04x00', 'p05x00'],
    ['p00x01', 'p01x01', 'p02x01', 'p03x01', 'p04x01', 'p05x01'],
    ['p00x02', 'p01x02', 'p02x02', 'p03x02', 'p04x02', 'p05x02'],
    ['p00x03', 'p01x03', 'p02x03', 'p03x03', 'p04x03', 'p05x03'],
    ['p00x04', 'p01x04', 'p02x04', 'p03x04', 'p04x04', 'p05x04'],
    ['p00x05', 'p01x05', 'p02x05', 'p03x05', 'p04x05', 'p05x05']])
I realize in this case that I could simply reshape the array, but in general the data will not work out this neatly.
You can use numpy.argsort to get the indices that would sort an array. You can then use these indices to sort your names array.
import numpy as np

names = np.array([
    'p00x00', 'p01x00', 'p02x00', 'p03x00', 'p04x00', 'p05x00',
    'p00x01', 'p01x01', 'p02x01', 'p03x01', 'p04x01', 'p05x01',
    'p00x02', 'p01x02', 'p02x02', 'p03x02', 'p04x02', 'p05x02',
    'p00x03', 'p01x03', 'p02x03', 'p03x03', 'p04x03', 'p05x03',
    'p00x04', 'p01x04', 'p02x04', 'p03x04', 'p04x04', 'p05x04',
    'p00x05', 'p01x05', 'p02x05', 'p03x05', 'p04x05', 'p05x05'])
X = np.array([2.102235, 2.094113, 2.086038, 2.077963, 2.069849, 2.061699])
Y = np.array([-7.788431, -7.780364, -7.772306, -7.764247, -7.756188, -7.748114])

x_order = np.argsort(X)
y_order = np.argsort(Y)
# indexing with a tuple (rather than the bare list meshgrid returns)
# avoids a deprecation warning in newer numpy versions
names_ordered = names.reshape(6, 6)[tuple(np.meshgrid(x_order, y_order))]
print(names_ordered)
gives the following output:
[['p00x05' 'p00x04' 'p00x03' 'p00x02' 'p00x01' 'p00x00']
['p01x05' 'p01x04' 'p01x03' 'p01x02' 'p01x01' 'p01x00']
['p02x05' 'p02x04' 'p02x03' 'p02x02' 'p02x01' 'p02x00']
['p03x05' 'p03x04' 'p03x03' 'p03x02' 'p03x01' 'p03x00']
['p04x05' 'p04x04' 'p04x03' 'p04x02' 'p04x01' 'p04x00']
['p05x05' 'p05x04' 'p05x03' 'p05x02' 'p05x01' 'p05x00']]
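The question notes that in general the data won't reshape this neatly. For that case, a hedged sketch: if you have one coordinate pair per name (hypothetical arrays x_all and y_all of length 36 here, rather than the 6 unique positions above), you can lexsort on the coordinates and then reshape:

import numpy as np

# x_all, y_all: assumed per-name coordinates, one entry per element of names
order = np.lexsort((x_all, y_all))  # primary key is y_all, secondary is x_all
grid = names[order].reshape(6, 6)

If the coordinates are noisy rather than exactly repeated, rounding them (e.g. np.round(y_all, 3)) before the lexsort keeps rows from interleaving.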

efficiently create dask.array from a dask.Series of lists

What is the most efficient way to create a dask.array from a dask.Series of lists?
The series consists of 5 million lists of 300 elements each.
It is currently divided into 500 partitions.
Currently I am trying:
pt = [delayed(np.array)(y)
      for y in
      [delayed(list)(x)
       for x in series.to_delayed()]]
da = delayed(dask.array.concatenate)(pt, axis=1)
da = dask.array.from_delayed(da, (series.size.compute(), 300), dtype=float)
The idea is to convert each partition into a numpy array and stitch
those together into a dask.array.
This code is taking forever to run though.
A numpy array can be built from this data quite quickly sequentially, as long as there is enough RAM.
I think that you are on the right track using dask.delayed. However, calling list on the series is probably not ideal. I would create a function that converts one of your partitions into a numpy array, and then go through delayed with that.
def convert_series_to_array(pandas_series):  # make this as fast as you can
    ...
    return numpy_array

L = dask_series.to_delayed()
L = [delayed(convert_series_to_array)(x) for x in L]
arrays = [da.from_delayed(x, shape=(np.nan, 300), dtype=...) for x in L]
x = da.concatenate(arrays, axis=0)
Also, regarding this line:
da = delayed(dask.array.concatenate)(pt, axis=1)
You should never call delayed on a dask function. They are already lazy.
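A minimal illustration of the point, reusing the arrays list from above:

import dask.array as da

# dask.array functions are already lazy: this only builds a task graph
x = da.concatenate(arrays, axis=0)
# wrapping it as delayed(da.concatenate)(arrays) adds a needless layer of
# indirection and hides the result's shape and dtype from dask
result = x.compute()  # execution happens here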
Looking at this with some dummy data, and building on @MRocklin's answer (molded more to my specific use case): let's say that your vectors are actually lists of ints instead of floats, and that each list is stored as a string. We take the series, transform it, and store it in a zarr array file.
import ast

import numpy as np
import pandas as pd
import dask.array as da
import dask.dataframe as dd
from dask import delayed

# create dummy data
vectors = [np.random.randint(low=0, high=100, size=300).tolist() for _ in range(1000)]
df = pd.DataFrame()
df['vector'] = vectors
df['vector'] = df['vector'].map(lambda x: f"{x}")
df['foo'] = 'bar'
ddf = dd.from_pandas(df, npartitions=100)

# transform series data to a numpy array
def convert_series_to_array(series):  # make this as fast as you can
    series_ = [ast.literal_eval(i) for i in series]
    return np.stack(series_, axis=0)

L = ddf['vector'].to_delayed()
L = [delayed(convert_series_to_array)(x) for x in L]
arrays = [da.from_delayed(x, shape=(np.nan, 300), dtype=np.int64) for x in L]
x = da.concatenate(arrays, axis=0)

# store the result in a zarr array on disk
# (to_zarr takes the store path as its first argument)
x.compute_chunk_sizes().to_zarr('/home/user/Documents/toy_dataset.zarr', overwrite=True)
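To read the stored array back lazily, a minimal follow-up sketch (assuming the store path used above):

import dask.array as da

x2 = da.from_zarr('/home/user/Documents/toy_dataset.zarr')
print(x2.shape)  # (1000, 300) for the dummy data above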

dynamically append N-dimensional array

If each array has the shape (1000, 2, 100), it is easy to use
con = np.concatenate((array_A, array_B))
to concatenate them, thus con has the shape (2000, 2, 100).
I want to dynamically append or concatenate "con" in a function. The steps are described as follows:
First, read data from the first file and process it to generate an array.
Secondly, read data from the second file and append the generated array to the first array.
....
def arrayappend():
    for i in range(n):
        # read data from file_0 to file_n-1
        data = read(file_i)
        # data processing to generate an array with shape (1000, 2, 100)
        con = function(data)
        # append con
Assuming all your files produce objects of the same shape and you want to join them on the first dimension, there are several options:
alist = []
for f in files:
    data = foo(f)
    alist.append(data)
arr = np.concatenate(alist, axis=0)
concatenate takes a list. There are variations if you want to add a new axis (np.array(alist), np.stack, etc.).
Appending to a list is fast, since it just means adding a pointer to the data object. concatenate creates a new array from the components; it's compiled, but still relatively slower.
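A quick sketch of why the list-then-concatenate pattern wins (dummy chunks standing in for the file data):

import numpy as np

chunks = [np.ones((1000, 2, 100)) for _ in range(20)]

# one concatenate over a list: each chunk is copied exactly once
arr = np.concatenate(chunks, axis=0)

# repeated concatenate: chunk 0 is recopied on every iteration,
# so total work grows quadratically with the number of chunks
acc = chunks[0]
for c in chunks[1:]:
    acc = np.concatenate((acc, c), axis=0)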
If you must/want to make a new array at each stage you could write:
arr = foo(files[0])
for f in files[1:]:
    data = foo(f)
    arr = np.concatenate((arr, data), axis=0)
This probably is slower, though if the file-loading step is slow enough you might not notice a difference.
With care you might be able to start with arr = np.zeros((0, 2, 100)) and read all files in the loop, as sketched below. You have to make sure the initial 'empty' array has a compatible shape. New users often have problems with this.
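A minimal sketch of that pattern (foo and files are the same assumed helpers as above):

import numpy as np

arr = np.zeros((0, 2, 100))  # 'empty' along the concatenation axis only
for f in files:
    data = foo(f)            # assumed to return an array of shape (1000, 2, 100)
    arr = np.concatenate((arr, data), axis=0)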
If you absolutely want to do it during iteration, then:
def arrayappend():
    con = None
    for i, d in enumerate(files_list):
        data = function(d)
        con = data if i == 0 else np.vstack([con, data])
This should stack it vertically.
Not pretty, but does it achieve what you want? It is completely unoptimized.
def arrayappend():
    for i in range(n):
        data = read(file_i)
        try:
            con  # raises NameError on the first iteration only
            con = np.concatenate((con, function(data)))
        except NameError:
            con = function(data)
    return con
The first iteration will take the except branch; subsequent ones won't.

How to convert from a dtype('O') to a dtype(float) in numpy?

I'm looking for a way to read this csv into Python 2.7 and turn it into a (3, 22000) array. For some reason I haven't been able to do it, no matter which way I try: I either get a group of strings in an array that I can't convert, or an array like the one seen below that won't convert to floats or allow computations to be done on it. Any help would be appreciated. Thanks.
For the record, it says the shape is (22000,), which I'm also unsure about.
In [126]: import csv
import numpy as np

with open("Data.csv") as sd:
    ri = []
    dv = []
    for row in csv.reader(sd):
        if row != ["ccx", "ccy", "ccz", "cellVolumes", "Cell Type"]:
            nrow = []
            for val in row[0:3]:
                val = float(val)
                nrow.append(val)
            ri.append(nrow)
            nrow = []
            for val in row[3:4]:
                val = float(val)
                nrow.append(val)
            dv.append(nrow)

ri = np.array(ri)
ri
Out[126]: array([[-0.179967, -0.38936, -0.46127], [-0.0633236, -0.407683, -0.542979],
       [-0.125841, -0.494202, -0.412042], ...,
       [-0.0116821, 0.764493, 0.573541], [0.630377, 0.469657, 0.442017],
       [0.248253, 0.615365, 0.354134]], dtype=object)
(from the helpful comments)
Check the length of those sublists. If they are all the same, I'd expect a 2d array; but if they differ (most 3, but some 0, 2, 4, etc.), then the best it can do is give you a 1d array of 'objects' - the lists.
I would just run [len(x) for x in ri] before passing it to np.array, and maybe apply max and min. A list comprehension like that won't take long.
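A quick sketch of that diagnostic, plus one way to recover a float array once the offending rows are found (the 3-values-per-row assumption is mine, based on the columns above):

lengths = [len(x) for x in ri]
print(min(lengths), max(lengths))  # if these differ, np.array falls back to dtype=object

# keep only the complete rows; then a float conversion works
clean = np.array([row for row in ri if len(row) == 3], dtype=float)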
