I have a function, which for simplicity I'll call shuffler; it takes a list, seeds random with 17, and then prints that list shuffled.
def shuffler(n):
    import random
    random.seed(17)
    random.shuffle(n)
    print(n)
How would I create another function called unshuffler that "unshuffles" the list produced by shuffler(), bringing it back to the list I passed into shuffler(), assuming that I know the seed?
Just wanted to contribute an answer that's more compatible with the functional patterns commonly used with numpy. This solution should also perform the fastest, since it takes advantage of numpy's internal optimizations, which can be pushed further with projects like numba, and it ought to be much faster than conventional loop structures in Python.
import numpy as np
original_data = np.array([23, 44, 55, 19, 500, 201]) # Some random numbers to represent the original data to be shuffled
data_length = original_data.shape[0]
# Here we create an array of shuffled indices
shuf_order = np.arange(data_length)
np.random.shuffle(shuf_order)
shuffled_data = original_data[shuf_order] # Shuffle the original data
# Create an inverse of the shuffled index array (to reverse the shuffling operation, or to "unshuffle")
unshuf_order = np.zeros_like(shuf_order)
unshuf_order[shuf_order] = np.arange(data_length)
unshuffled_data = shuffled_data[unshuf_order] # Unshuffle the shuffled data
print(f"original_data: {original_data}")
print(f"shuffled_data: {shuffled_data}")
print(f"unshuffled_data: {unshuffled_data}")
assert np.all(np.equal(unshuffled_data, original_data))
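For the original question, where the shuffle order comes from a known seed rather than a stored index array, the same inverse-permutation trick applies. Below is a minimal sketch using numpy's Generator API (np.random.default_rng); the function names, and the choice of a numpy-based shuffle rather than the stdlib random.shuffle from the question, are my own illustration:
import numpy as np

def shuffle_with_seed(data, seed=17):
    # Regenerate the permutation from the seed and apply it.
    rng = np.random.default_rng(seed)
    order = rng.permutation(len(data))
    return np.asarray(data)[order]

def unshuffle_with_seed(shuffled, seed=17):
    # Regenerate the same permutation, then scatter each value back to its
    # original position (i.e. apply the inverse permutation).
    shuffled = np.asarray(shuffled)
    rng = np.random.default_rng(seed)
    order = rng.permutation(len(shuffled))
    unshuffled = np.empty_like(shuffled)
    unshuffled[order] = shuffled
    return unshuffled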
Here are two functions that do what you need:
import random
import numpy as np
def shuffle_forward(l):
    order = range(len(l)); random.shuffle(order)
    return list(np.array(l)[order]), order

def shuffle_backward(l, order):
    l_out = [0] * len(l)
    for i, j in enumerate(order):
        l_out[j] = l[i]
    return l_out
Example
l = range(10000); random.shuffle(l)
l_shuf, order = shuffle_forward(l)
l_unshuffled = shuffle_backward(l_shuf, order)
print l == l_unshuffled
#True
Reseed the random generator with the seed in question and then shuffle the list of indices 0, 1, ..., n-1. This tells you exactly what ended up where in the shuffle.
In Python3:
import random
import numpy as np

def shuffle_forward(l):
    order = list(range(len(l))); random.shuffle(order)
    return list(np.array(l)[order]), order

def shuffle_backward(l, order):
    l_out = [0] * len(l)
    for i, j in enumerate(order):
        l_out[j] = l[i]
    return l_out
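A short usage sketch for the Python 3 version, mirroring the example above:
l = list(range(10000)); random.shuffle(l)
l_shuf, order = shuffle_forward(l)
l_unshuffled = shuffle_backward(l_shuf, order)
print(l == l_unshuffled)
# True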
import numpy as np
import pandas as pd
import cmath
a = np.array([[complex(3,6),complex(7,9),complex(2,8),complex(6,5)],
[complex(3,7),complex(7,9),complex(2,8),complex(6,5)],
[complex(3,8),complex(7,9),complex(2,8),complex(6,5)],
[complex(3,9),complex(7,9),complex(2,8),complex(6,5)],
[complex(3,1),complex(7,9),complex(2,8),complex(6,5)],
[complex(3,2),complex(7,9),complex(2,8),complex(6,5)],
[complex(3,3),complex(7,9),complex(2,8),complex(6,5)],
[complex(3,4),complex(7,9),complex(2,8),complex(6,5)],
])
l = np.array(['eval1_real','eval2_real','eval3_real','eval4_real','eval1_imag','eval2_imag','eval3_imag','eval4_imag'])
x = 1
for i in range(0, len(a), 1):
    w = a[i]
    e1r = w[0].real
    e1c = w[0].imag
    e2r = w[1].real
    e2c = w[1].imag
    e3r = w[2].real
    e3c = w[2].imag
    e4r = w[3].real
    e4c = w[3].imag
    p = np.array([e1r, e1c, e2r, e2c, e3r, e3c, e4r, e4c])
    m = np.insert(l, x, p, 0)
    x = x + 1
I tried a for loop to separate the parts, but I cannot get those numbers to come together into a full matrix.
Is there a way to separate them all without using a loop, or some array function I can use to put them together?
You should learn to use numpy's built-in functions for element-wise operations on all elements. You can try:
result = np.dstack(
    np.apply_along_axis(
        lambda x: [x.real, x.imag], 0, a)
).flatten().reshape(8, 8)
numpy.apply_along_axis
numpy.dstack
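As a simpler alternative sketch (my own addition, assuming the four-real-columns-then-four-imaginary-columns layout suggested by the label array l), the real and imaginary parts can also be stacked directly:
result = np.hstack([a.real, a.imag])  # shape (8, 8): the four real columns followed by the four imaginary columns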
I would like to apply a function to each of the 3x3 matrices in my (6890, 6890, 3, 3) numpy array. So far I have tried vectorization on a smaller example with a simpler function, but it didn't work out.
def myfunc(x):
    return np.linalg.norm(x)

m = np.arange(45).reshape(5, 3, 3)
t = m.shape[0]
r = np.zeros((t, t))
q = m[:, None, ...] @ m.swapaxes(1, 2)  # m[i] @ m[j].T for all pairs, shape (5, 5, 3, 3)
f = np.vectorize(q, otypes=[float])
res = myfunc(f)
Is vectorization even the right approach to solve this problem efficiently or should I try something else? I've also looked into numpy.apply_along_axis but this only applies to 1D-subarrays.
You need to loop over each element and apply the function:
import numpy as np

# set up the function
def myfunc(x):
    return np.linalg.norm(x)

# set up the data array
data = np.arange(45).reshape(5, 3, 3)

# loop over the elements and update them in place
for item in np.nditer(data, op_flags=['readwrite']):
    item[...] = myfunc(item)
If you need to apply the function to each entire 3x3 array, then use:
out_data = []
for item in data:
    out_data.append(myfunc(item))
Output:
[14.2828568570857, 39.761790704142086, 66.4529909033446, 93.32202312423365, 120.24974012445931]
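For completeness, here is a vectorized sketch (my own addition, not part of the answer above) that computes the same per-block norms without an explicit Python loop, assuming the same 5x3x3 data array:
import numpy as np

data = np.arange(45).reshape(5, 3, 3)
# Flatten each 3x3 block and take the 2-norm along the last axis;
# this equals the Frobenius norm of each block.
norms = np.linalg.norm(data.reshape(data.shape[0], -1), axis=1)
print(norms)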
I want to randomly distribute N items among n cells. Both N and n can be large, so I would rather not loop over the random draws as here:
import numpy as np

nitems = 100
ncells = 3
cells = np.zeros(ncells, dtype=int)
for _ in range(nitems):
    dest = np.random.randint(ncells)
    cells[dest] += 1
print(cells)
In this case, the output is:
[31 34 35]
(the sum is always N)
Is there any faster way?
An answer to the question follows (I have to thank @pjs here for his help). I think it is the fastest, and probably the shortest and most space-efficient one possible:
from numpy import *
from time import sleep

g_nitems = 10000
g_ncells = 10
g_nsamples = 10000

def genDist(nitems, ncells):
    r = sort(random.randint(0, nitems+1, ncells-1))
    return concatenate((r, [nitems])) - concatenate(([0], r))

# Some stats
test = zeros(g_ncells, dtype=int)
Max = zeros(g_ncells, dtype=int)
for _ in range(g_nsamples):
    tmp = genDist(g_nitems, g_ncells)
    print(tmp.sum(), tmp, end='\r')
    # print(_, end='\r')
    # sleep(0.5)
    test += tmp
    for i in range(g_ncells):
        if tmp[i] > Max[i]:
            Max[i] = tmp[i]

print("\n", Max)
print(test//g_nsamples)
On my machine, your code with a timeit took 151 microseconds. The following took 11 microseconds:
import numpy as np
nitems = 100
ncells = 3
values = np.random.randint(0,ncells,nitems)
cells = np.array_split(values,3)
lengths= [ len(cell) for cell in cells ]
print(lengths,np.sum(lengths))
The result of the print is [34, 33, 33] 100.
The magic here is using numpy to do the splitting, but note that this method will split as close to uniform as possible.
If you want the partitioning done randomly:
import numpy as np
nitems = 100
ncells = 3
values = np.random.randint(0,ncells,nitems)
ind_split = [ np.random.randint(0,nitems) ]
ind_split.append(np.random.randint(ind_split[-1],nitems))
cells = np.array_split(values,ind_split)
lengths= [ len(cell) for cell in cells ]
print(lengths,np.sum(lengths))
This takes advantage of numpy.array_split taking indices of where to perform the split as an argument (rather than the number of near-uniform partitions).
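As a rough sketch of how this could generalize to an arbitrary ncells (my own assumption, not part of the original answer): draw ncells - 1 random split indices, sort them, and hand them to numpy.array_split.
import numpy as np

nitems = 100
ncells = 5  # illustrative value
values = np.random.randint(0, ncells, nitems)
ind_split = np.sort(np.random.randint(0, nitems, ncells - 1))  # ncells - 1 split points
cells = np.array_split(values, ind_split)
lengths = [len(cell) for cell in cells]
print(lengths, np.sum(lengths))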
You haven't specified that the counts have to have any particular distribution as long as they add up to N, so the following will work as requested:
import numpy as np
nitems = 100
ncells = 3
range_array = [np.random.randint(nitems + 1) for _ in range(ncells - 1)] + [0, nitems]
range_array.sort()
cells = [range_array[i + 1] - range_array[i] for i in range(ncells)]
print(cells)
It generates an ordered set of random values between 0 and nitems, then takes successive differences to generate the desired number of cell counts.
The complexity is O(ncells) rather than O(nitems), so it should be more efficient when there are substantially more items than cells.
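If you do want the counts to follow the same distribution as the original loop (each item placed independently in a uniformly random cell), numpy.random.multinomial produces that directly without a Python loop; a minimal sketch, offered as an addition rather than as part of the answers above:
import numpy as np

nitems = 100
ncells = 3
cells = np.random.multinomial(nitems, [1.0 / ncells] * ncells)
print(cells, cells.sum())  # the counts always sum to nitems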
I'm rather new to NumPy. Anyone have an idea for making this code, especially the nested loops, more compact/efficient? BTW, dist and data are three-dimensional numpy arrays.
def interpolate_to_distance(self, distance):
    interpolated_data = np.ndarray(self.dist.shape[1:])
    for j in range(interpolated_data.shape[1]):
        for i in range(interpolated_data.shape[0]):
            interpolated_data[i, j] = np.interp(
                distance, self.dist[:, i, j], self.data[:, i, j])
    return interpolated_data
Thanks!
Alright, I'll take a swag with this:
def interpolate_to_distance(self, distance):
    dshape = self.dist.shape
    dist = self.dist.T.reshape(-1, dshape[-1])
    data = self.data.T.reshape(-1, dshape[-1])
    intdata = np.array([np.interp(distance, di, da)
                        for di, da in zip(dist, data)])
    return intdata.reshape(dshape[0:2]).T
It at least removes one loop (and those nested indices), but it's not much faster than the original, ~20% faster according to %timeit in IPython. On the other hand, there's a lot of (probably unnecessary, ultimately) transposing and reshaping going on.
For the record, I wrapped it up in a dummy class and filled some 3 x 3 x 3 arrays with random numbers to test:
import numpy as np

class TestClass(object):
    def interpolate_to_distance(self, distance):
        dshape = self.dist.shape
        dist = self.dist.T.reshape(-1, dshape[-1])
        data = self.data.T.reshape(-1, dshape[-1])
        intdata = np.array([np.interp(distance, di, da)
                            for di, da in zip(dist, data)])
        return intdata.reshape(dshape[0:2]).T

    def interpolate_to_distance_old(self, distance):
        interpolated_data = np.ndarray(self.dist.shape[1:])
        for j in range(interpolated_data.shape[1]):
            for i in range(interpolated_data.shape[0]):
                interpolated_data[i, j] = np.interp(
                    distance, self.dist[:, i, j], self.data[:, i, j])
        return interpolated_data

if __name__ == '__main__':
    testobj = TestClass()
    testobj.dist = np.random.randn(3, 3, 3)
    testobj.data = np.random.randn(3, 3, 3)
    distance = 0
    print 'Old:\n', testobj.interpolate_to_distance_old(distance)
    print 'New:\n', testobj.interpolate_to_distance(distance)
Which prints (for my particular set of randoms):
Old:
[[-0.59557042 -0.42706077 0.94629049]
[ 0.55509032 -0.67808257 -0.74214045]
[ 1.03779189 -1.17605275 0.00317679]]
New:
[[-0.59557042 -0.42706077 0.94629049]
[ 0.55509032 -0.67808257 -0.74214045]
[ 1.03779189 -1.17605275 0.00317679]]
I also tried np.vectorize(np.interp) but couldn't get that to work. I suspect that would be much faster if it did work.
I couldn't get np.fromfunction to work either, as it passed two 3 x 3 (in this case) arrays of indices to np.interp, the same arrays you get from np.mgrid.
One other note: according to the docs for np.interp,
np.interp does not check that the x-coordinate sequence xp is increasing. If
xp is not increasing, the results are nonsense. A simple check for
increasingness is:
np.all(np.diff(xp) > 0)
Obviously, my random numbers violate the 'always increasing' rule, but you'll have to be more careful.
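One hedged way to satisfy that requirement (my own addition, assuming the same testobj arrays as above) is to sort dist along the interpolation axis and reorder data to match:
order = np.argsort(testobj.dist, axis=0)
dist_sorted = np.take_along_axis(testobj.dist, order, axis=0)
data_sorted = np.take_along_axis(testobj.data, order, axis=0)
# Now each dist_sorted[:, i, j] is increasing, as np.interp expects.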
I want to generate a random string of a given length in which each character is chosen with a specified probability. For example, the function could be something like def RandABCD(n, .25, .34, .25, .25), where n is the length of the string to be generated and the following numbers are the desired probabilities of A, B, C, and D.
I would imagine this is quite simple; however, I am having trouble creating a working program. Any help would be greatly appreciated.
Here's the code to select a single weighted value. You should be able to take it from here. It uses bisect and random to accomplish the work.
from bisect import bisect
from random import random

def WeightedABCD(*weights):
    chars = 'ABCD'
    breakpoints = [sum(weights[:x+1]) for x in range(4)]
    return chars[bisect(breakpoints, random())]
Call it like this: WeightedABCD(.25, .34, .25, .25).
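To build a whole string of length n from it, a minimal sketch (my own addition, not part of the original answer):
n = 20  # illustrative length
s = ''.join(WeightedABCD(.25, .34, .25, .25) for _ in range(n))
print(s)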
EDIT: Here is a version that works even if the weights don't add up to 1.0:
from bisect import bisect_left
from random import uniform

def WeightedABCD(*weights):
    chars = 'ABCD'
    breakpoints = [sum(weights[:x+1]) for x in range(4)]
    return chars[bisect_left(breakpoints, uniform(0.0, breakpoints[-1]))]
The random module is quite powerful in Python. You can generate a list with the characters desired at the appropriate weights and then use random.choice to obtain a selection.
First, make sure you do an import random.
For example, let's say you wanted a truly random string from A,B,C, or D.
1. Generate a list with the characters
li = ['A','B','C','D']
Then obtain values from it using random.choice
output = "".join([random.choice(li) for i in range(0, n)])
You could easily make that a function with n as a parameter.
In the above case, you have an equal chance of getting A,B,C, or D.
You can use duplicate entries in the list to give characters higher probabilities. So, for example, let's say you wanted a 50% chance of A and 25% chances of B and C respectively. You could have an array like this:
li = ['A','A','B','C']
And so on.
It would not be hard to parameterize the characters coming in with desired weights; to model that, I'd use a dictionary.
characterbasis = {'A':25, 'B':25, 'C':25, 'D':25}
Make that the first parameter and the length of the string the second, then use the above code to generate your string.
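A minimal sketch of that parameterization (my own illustration of the idea above; the function name and the expansion of integer weights into a duplicate-entry list are assumptions):
import random

def weighted_string(weights, n):
    # Expand each character into the list as many times as its integer weight,
    # then draw n characters from that list.
    li = [char for char, count in weights.items() for _ in range(count)]
    return ''.join(random.choice(li) for _ in range(n))

print(weighted_string({'A': 25, 'B': 25, 'C': 25, 'D': 25}, 10))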
For four letters, here's something quick off the top of my head:
from random import random

def randABCD(n, pA, pB, pC, pD):
    # assumes pA + pB + pC + pD == 1
    cA = pA
    cB = cA + pB
    cC = cB + pC
    def choose():
        r = random()
        if r < cA:
            return 'A'
        elif r < cB:
            return 'B'
        elif r < cC:
            return 'C'
        else:
            return 'D'
    return ''.join([choose() for i in xrange(n)])
I have no doubt that this can be made much cleaner/shorter, I'm just in a bit of a hurry right now.
The reason I wouldn't be content with David in Dakota's answer of using a list of duplicate characters is that depending on your probabilities, it may not be possible to create a list with duplicates in the right numbers to simulate the probabilities you want. (Well, I guess it might always be possible, but you might wind up needing a huge list - what if your probabilities were 0.11235442079, 0.4072777384, 0.2297927874, 0.25057505341?)
EDIT: here's a much cleaner generic version that works with any number of letters with any weights:
from bisect import bisect
from random import uniform

def rand_string(n, content):
    ''' Creates a string of letters (or substrings) chosen independently
    with specified probabilities. content is a dictionary mapping
    a substring to its "weight", which is proportional to its probability,
    and n is the desired number of elements in the string.
    This does not assume the sum of the weights is 1.'''
    l, cdf = zip(*[(l, w) for l, w in content.iteritems()])
    cdf = list(cdf)
    for i in xrange(1, len(cdf)):
        cdf[i] += cdf[i - 1]
    return ''.join([l[bisect(cdf, uniform(0, cdf[-1]))] for i in xrange(n)])
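A quick usage sketch (Python 2, matching the code above; the weights here are just an example):
print rand_string(12, {'A': 0.5, 'B': 0.25, 'CD': 0.25})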
Here is a rough idea of what might suit you
import random as r

def distributed_choice(probs):
    # Draw one uniform sample, then walk the cumulative weights until it falls
    # inside an interval. (The local variable must not shadow the module alias r.)
    x = r.random()
    cum = 0.0
    for pair in probs:
        if x < cum + pair[1]:
            return pair[0]
        cum += pair[1]
The parameter probs takes a list of pairs of the form (object, probability). It is assumed that the sum of probabilities is 1 (otherwise, it's trivial to normalize).
To use it just execute:
''.join([distributed_choice(probs) for _ in range(4)])
Hmm, something like:
import random

class RandomDistribution:
    def __init__(self, kv):
        self.entries = kv.keys()
        self.where = []
        cnt = 0
        for x in self.entries:
            self.where.append(cnt)
            cnt += kv[x]
        self.where.append(cnt)

    def find(self, key):
        l, r = 0, len(self.where)-1
        while l+1 < r:
            m = (l+r)/2
            if self.where[m] <= key:
                l = m
            else:
                r = m
        return self.entries[l]

    def randomselect(self):
        return self.find(random.random()*self.where[-1])

rd = RandomDistribution({"foo": 5.5, "bar": 3.14, "baz": 2.8})
for x in range(1000):
    print rd.randomselect()
should get you most of the way...
Thank you all for your help, I was able to figure something out, mostly with this info.
For my particular need, I did something like this:
import random

# Create a function to randomize a given string
def makerandom(seq):
    return ''.join(random.sample(seq, len(seq)))

def randomDNA(n, probA=0.25, probC=0.25, probG=0.25, probT=0.25):
    notrandom = ''
    A = int(n*probA)
    C = int(n*probC)
    T = int(n*probT)
    G = int(n*probG)
    # The remainder part here is used to make sure all n are used, as one cannot
    # have half an A, for example.
    remainder = ''
    for i in range(0, n-(A+G+C+T)):
        remainder += random.choice("ATGC")
    notrandom = notrandom + 'A'*A + 'C'*C + 'G'*G + 'T'*T + remainder
    return makerandom(notrandom)
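A quick usage sketch (my own example, not part of the original post):
print(randomDNA(20, probA=0.5, probC=0.2, probG=0.2, probT=0.1))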