Computing stats on generators in a single pass - python

When working with a generator you can only pull items out in a single pass. An alternative is to load the generator into a list and make multiple passes, but that costs both performance and memory.
Can anyone think of a better way of computing the following metrics from a generator in a single pass? Ideally the code computes the count, sum, average, standard deviation, max, min and any other stats you can think of.
UPDATE
My initial (horrid) code is in this gist: https://gist.github.com/3038746
Using the great suggestions from @larsmans, here is the final solution I went with. Using the namedtuple really helped.
import random
from math import sqrt
from collections import namedtuple

def stat(gen):
    """Returns the namedtuple Stat as below."""
    Stat = namedtuple('Stat', 'total, sum, avg, sd, max, min')
    it = iter(gen)
    x0 = next(it)
    mx = mn = s = x0
    s2 = x0*x0
    n = 1
    for x in it:
        mx = max(mx, x)
        mn = min(mn, x)
        s += x
        s2 += x*x
        n += 1
    return Stat(n, s, s/n, sqrt(s2/n - s*s/n/n), mx, mn)

def random_int_list(size=100, start=0, end=1000):
    return (random.randrange(start, end, 1) for x in xrange(size))

if __name__ == '__main__':
    r = stat(random_int_list())
    print r  # Stat(total=100, sum=56295, avg=562, sd=294.82537204250247, max=994, min=10)
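One caveat with the sum-of-squares formula above: sqrt(s2/n - s*s/n/n) can lose precision when the values are large relative to their spread. A numerically stabler single-pass alternative is Welford's online algorithm; here is a minimal sketch (my addition, not part of the original answers):

from math import sqrt

def stat_welford(gen):
    """Single-pass count/mean/sd/max/min via Welford's online algorithm."""
    it = iter(gen)
    x0 = next(it)
    n, mean, m2 = 1, float(x0), 0.0
    mx = mn = x0
    for x in it:
        n += 1
        delta = x - mean
        mean += delta / n          # running mean
        m2 += delta * (x - mean)   # running sum of squared deviations
        mx = max(mx, x)
        mn = min(mn, x)
    return n, mean, sqrt(m2 / n), mx, mn  # population standard deviation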

def statistics(it):
    """Returns number of elements, sum, max, min"""
    it = iter(it)
    x0 = next(it)
    maximum = minimum = total = x0
    n = 1
    for x in it:
        maximum = max(maximum, x)
        minimum = min(minimum, x)
        total += x
        n += 1
    return n, total, maximum, minimum
Add other statistics as you please. Consider using a namedtuple when the number of statistics to compute grows large.
If you want to get really fancy, you can build an OO hierarchy of statistics collectors (untested):
class Summer(object):
    def __init__(self, x0=0):
        self.value = x0
    def add(self, x):
        self.value += x

class SquareSummer(Summer):
    def __init__(self, x0=0):
        # square the seed value too, since collect() passes the first element in raw
        super(SquareSummer, self).__init__(x0 ** 2)
    def add(self, x):
        super(SquareSummer, self).add(x ** 2)

class Maxer(object):
    def __init__(self, x0):
        self.value = x0
    def add(self, x):
        self.value = max(self.value, x)

# example usage: collect([Maxer, Summer], iterable)
def collect(collectors, it):
    it = iter(it)
    x0 = next(it)
    collectors = [c(x0) for c in collectors]
    for x in it:
        for c in collectors:
            c.add(x)
    return [c.value for c in collectors]
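A sketch of how collect() might be used to recover the mean and standard deviation in one pass (my addition, untested like the original; it assumes the SquareSummer seeding fix above and a known element count):

from math import sqrt

data = [3, 1, 4, 1, 5, 9, 2, 6]
mx, s, s2 = collect([Maxer, Summer, SquareSummer], data)
n = float(len(data))   # for a true generator, add a counting collector instead
mean = s / n
sd = sqrt(s2 / n - mean * mean)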

Related

Loop a function using the previous output as input

When my function foo generates a new value, I want to feed the output back into foo, n times. How can I do it?
My function:
def foo(x):
    return x + 3

print(foo(1))  # 4
For now, I'm using this method:
print(foo(foo(foo(1))))
There are a couple of ways to do what you want. The first is recursion, but it involves changing foo() a bit, like so:
def foo(x, depth):
    if depth <= 0:
        return x
    return foo(x+3, depth-1)
and you'd call it like foo(1, n).
The other way is with a loop and a temp variable, like so:
val = 1
for _ in range(0, n):
    val = foo(val)
Use a loop for this:
value = 1
for i in range(10):
    value = foo(value)
def foo(x, y):
    for i in range(y):
        x = x + 3
    return x

print(foo(10, 3))
Output:
19
What you are searching for is called recursion:
def foo(x, n=1):
    if n == 0:
        return x
    return foo(x + 3, n - 1)
Another possibility is lambda with reduce:
from functools import reduce

def foo(x):
    return x + 3

print(reduce(lambda y, _: foo(y), range(3), 1))
You will get 10 as the result.
# y = the accumulated return value of foo
# _ = the current element of range(3), ignored; range(3) just drives reduce n times
# 3 = n, the number of applications
# 1 = the initial value fed to foo
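One more option, offered as a sketch rather than one of the original answers: an iterate-style generator combined with itertools.islice, which also gives you every intermediate value for free.

from itertools import islice

def iterate(f, x):
    # yields x, f(x), f(f(x)), f(f(f(x))), ...
    while True:
        yield x
        x = f(x)

def foo(x):
    return x + 3

# the element at index n is foo applied n times to the seed
print(next(islice(iterate(foo, 1), 3, None)))  # foo(foo(foo(1))) == 10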

Evaluating Polynomial coefficients

I'm trying to write a function that takes as input a list of coefficients (a0, a1, a2, ..., an) of a polynomial p(x) and a value x. The function returns p(x), the value of the polynomial evaluated at x.
A polynomial of degree n with coefficients a0, a1, a2, ..., an is the function
p(x) = a0 + a1*x + a2*x^2 + a3*x^3 + ... + an*x^n
So I'm not sure how to attack the problem. I'm thinking that I will need a range, but how can I make it handle any numerical input for x? I'm not expecting you guys to give the answer, I'm just in need of a little kick start. Do I need a for loop, a while loop, or could recursion be an option here?
def poly(lst, x):
I need to iterate over the items in the list. Do I use the indices for that, and how can I make it iterate over an unknown number of items?
I'm thinking I can use recursion here:
def poly(lst, x):
    n = len(lst)
    if n==4:
        return lst[0]+lst[1]*x+lst[2]*x**2+lst[3]*x**3
    elif n==3:
        return lst[0]+lst[1]*x+lst[2]*x**2
    elif n==2:
        return lst[0]+lst[1]*x
    elif n==1:
        return lst[0]
    else:
        return lst[0]+lst[1]*x+lst[2]*x**2+lst[3]*x**3+lst[n]*x**n
This works for n<=4, but I get an IndexError: list index out of range for n>4; I can't see why though.
The most efficient way is to evaluate the polynomial backwards using Horner's Rule. It is very easy to do in Python:
# Evaluate a polynomial in reverse order using Horner's Rule,
# for example: a3*x^3 + a2*x^2 + a1*x + a0 = ((a3*x + a2)*x + a1)*x + a0
def poly(lst, x):
    total = 0
    for a in reversed(lst):
        total = total*x + a
    return total
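A quick sanity check (my addition): lst = [1, 2, 3] encodes 1 + 2x + 3x^2, and Horner gives the same answer as direct evaluation.

print(poly([1, 2, 3], 2))  # 1 + 2*2 + 3*2**2 = 17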
Simple:
def poly(lst, x):
    n, tmp = 0, 0
    for a in lst:
        tmp = tmp + (a * (x**n))
        n += 1
    return tmp

print poly([1,2,3], 2)
Simple recursion:
def poly(lst, x, i=0):
    try:
        tmp = lst.pop(0)
    except IndexError:
        return 0
    return tmp * (x ** i) + poly(lst, x, i+1)

print poly([1,2,3], 2)
def evalPoly(lst, x):
    total = 0
    for power, coeff in enumerate(lst):  # power starts at 0 by default
        total += (x**power) * coeff
    return total
Alternatively, you can use a list and then use sum:
def evalPoly(lst, x):
    total = []
    for power, coeff in enumerate(lst):
        total.append((x**power) * coeff)
    return sum(total)
Without enumerate:
def evalPoly(lst, x):
    total, power = 0, 0
    for coeff in lst:
        total += (x**power) * coeff
        power += 1
    return total
Alternative to the non-enumerate method:
def evalPoly(lst, x):
    total = 0
    for power in range(len(lst)):
        total += (x**power) * lst[power]  # lst[power] is the coefficient
    return total
As @DSM stated, you can put this together in a single line:
def evalPoly(lst, x):
    return sum((x**power) * coeff for power, coeff in enumerate(lst))
Or, using lambda:
evalPoly = lambda lst, x: sum((x**power) * coeff for power, coeff in enumerate(lst))
Recursive solution (the base case must stop once power runs past the last coefficient, otherwise lst[power] raises an IndexError):
def evalPoly(lst, x, power=0):
    if power == len(lst):  # past the last coefficient
        return 0
    return ((x**power) * lst[power]) + evalPoly(lst, x, power + 1)
enumerate(iterable, start) is a generator (so it uses yield instead of return) that yields a number and then an element of the iterable. The number equals the index of the element plus start.
From the Python docs, it is also the same as:
def enumerate(sequence, start=0):
    n = start
    for elem in sequence:
        yield n, elem
        n += 1
With or without recursion, the essence of the solution is to loop over n, because the polynomial runs from a0*x^0 up to an*x^n, so n is implicit in the input. Beyond that, use the multiply-and-accumulate trick to build up partial results on each loop iteration.
def evalPoly(lst, x, power):
    if power == 0:
        return lst[power]
    return ((x**power) * lst[power]) + evalPoly(lst, x, power - 1)

lst = [7, 1, 2, 3]
x = 5
print(evalPoly(lst, x, 3))
The polynomial evaluated is 3x^3 + 2x^2 + x + 7; when x = 5, the result is 437.

Pseudorandom number generator for hashing that can take in any size table

I am constructing a pseudo random number generator for hashing. The algorithm I need to use is as follows:
Initialize an integer R to be equal to 1 every time the tabling routine is called
On each successive call for a random number, set R = R*5
Mask all but the lower order n+2 bits of the product and place the result in R
Set P = R/4 and return
This is what I have so far, which works for a table of size 2^n, but how can I change it so it can take in a table of any size?
import math

def rand(size, i):
    n = math.log(size, 2)
    r = 1
    random_list = []
    mask = (1 << 2+int(n)) - 1
    for n in range(1, size+1):
        r = r*5
        r &= mask
        p = r/4
        random_list = random_list + [p]
    if i == 0: return random_list
    else: return random_list[i-1]
I didn't really understand how your code relates to your algorithm (what is random_list for?) or how the code should be structured, but I assume it is something like this:
class Rand:
    def __init__(self, n):
        # Initialize an integer R to be equal to 1 every time the tabling routine is called
        self.r = 1
        self.n = n
    def rand(self):
        # On each successive call for a random number, set R = R*5
        self.r *= 5
        # Mask all but the lower order n+2 bits of the product and place the result in R
        self.r = self.r & (pow(2, self.n)-1)
        # Set P = R/4 and return
        self.p = self.r/4
        return self.p
In which case, to make it work with a table of any size, the class becomes this:
class Rand2:
    def __init__(self, tableSize):
        # Initialize an integer R to be equal to 1 every time the tabling routine is called
        self.r = 1
        self.tableSize = tableSize
    def rand(self):
        # On each successive call for a random number, set R = R*5
        self.r *= 5
        # A bit mask is essentially a modulus operation, which is what we do instead
        self.r = self.r % self.tableSize
        # Set P = R/4 and return
        self.p = self.r/4
        return self.p
A simple test proves the outcome to be the same when the table sizes are identical:
rnd = Rand(10)
for i in range(0, 10):
    print rnd.rand()

rnd2 = Rand2(pow(2, 10))
for i in range(0, 10):
    print rnd2.rand()
But, like I said, I didn't really understand how your code related to your algorithm. I guess the tl;dr here is use the modulus operator instead of a bit mask.
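The equivalence being relied on, shown as a quick illustration (my addition): masking with 2**k - 1 keeps the low k bits, which is exactly reduction mod 2**k, and the modulus operator simply generalizes this to sizes that are not powers of two.

k = 10
for x in (5, 12345, 999999):
    assert x & ((1 << k) - 1) == x % (1 << k)  # mask == mod for powers of two
print("mask/mod equivalence holds")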

Quickly Find the Index in an Array Closest to Some Value

I have an array of values, t, that is always in increasing order (but not always uniformly spaced). I have another single value, x. I need to find the index in t such that t[index] is closest to x. The function must return zero for x < t.min() and the max index (or -1) for x > t.max().
I've written two functions to do this. The first one, f1, is MUCH quicker in this simple timing test, but I like how the second one is just one line. This calculation will be done on a large array, potentially many times per second.
Can anyone come up with some other function with timing comparable to the first but with cleaner-looking code? How about something quicker than the first (speed is most important)?
Thanks!
Code:
import numpy as np
import timeit

t = np.arange(10, 100000)          # Not always uniform, but in increasing order
x = np.random.uniform(10, 100000)  # Some value to find within t

def f1(t, x):
    ind = np.searchsorted(t, x)   # Get index to preserve order
    ind = min(len(t)-1, ind)      # In case x > max(t)
    ind = max(1, ind)             # In case x < min(t)
    if x < (t[ind-1] + t[ind]) / 2.0:  # Closer to the smaller number
        ind = ind-1
    return ind

def f2(t, x):
    return np.abs(t-x).argmin()

print t, '\n', x, '\n'
print f1(t, x), '\n', f2(t, x), '\n'
print t[f1(t, x)], '\n', t[f2(t, x)], '\n'

runs = 1000
time = timeit.Timer('f1(t, x)', 'from __main__ import f1, t, x')
print round(time.timeit(runs), 6)
time = timeit.Timer('f2(t, x)', 'from __main__ import f2, t, x')
print round(time.timeit(runs), 6)
This seems much quicker (for me, Python 3.2-win32, numpy 1.6.0):
from bisect import bisect_left

def f3(t, x):
    i = bisect_left(t, x)
    if t[i] - x > 0.5:
        i -= 1
    return i
Output:
[   10    11    12 ..., 99997 99998 99999]
37854.22200356027
37844
37844
37844
37854
37854
37854
f1  0.332725
f2  1.387974
f3  0.085864
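Two caveats with f3 as written (my note, not the answerer's): bisect_left returns len(t) when x > t[-1], so t[i] raises an IndexError, and the fixed 0.5 threshold assumes integer spacing as in the example. A hedged variant adding the boundary handling the question asks for:

from bisect import bisect_left

def f3_guarded(t, x):
    i = bisect_left(t, x)
    if i == len(t):    # x > t.max(): clamp to the last index
        return len(t) - 1
    if i == 0:         # x < t.min(): clamp to the first index
        return 0
    # otherwise return whichever neighbour is closer to x
    return i - 1 if x - t[i-1] < t[i] - x else i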
np.searchsorted is a binary search (it splits the array in half each time), so you just have to adapt it so that it returns the last value smaller than x instead of returning zero.
Look at this algorithm (from here):
def binary_search(a, x):
    lo = 0
    hi = len(a)
    while lo < hi:
        mid = (lo+hi)//2
        midval = a[mid]
        if midval < x:
            lo = mid+1
        elif midval > x:
            hi = mid
        else:
            return mid
    return lo-1 if lo > 0 else 0
I just replaced the last line (it was return -1) and changed the arguments.
As the loop is written in pure Python, it may be slower than the first one... (not benchmarked).
Use searchsorted:
t = np.arange(10, 100000)  # Not always uniform, but in increasing order
x = np.random.uniform(10, 100000)
print t.searchsorted(x)
Edit:
Ah yes, I see that's what you do in f1. Maybe f3 below is easier to read than f1.
def f3(t, x):
    ind = t.searchsorted(x)
    if ind == len(t):
        return ind - 1  # x > max(t)
    elif ind == 0:
        return 0
    before = ind - 1
    if x - t[before] < t[ind] - x:
        ind -= 1
    return ind
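A quick sanity check of the required edge cases (my sketch, not part of the original answer):

import numpy as np

t = np.arange(10, 100000)
assert f3(t, 5) == 0                # x < t.min() -> first index
assert f3(t, 10**6) == len(t) - 1   # x > t.max() -> last index
assert t[f3(t, 500.4)] == 500       # nearest value wins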

hash functions family generator in python

I am looking for a hash-function family generator that could generate a family of hash functions given a set of parameters. I haven't found any such generator so far.
Is there a way to do that with the hashlib package?
For example, I'd like to do something like:
h1 = hash_function(1)
h2 = hash_function(2)
...
and h1 and h2 would be different hash functions.
For those of you who might know about it, I am trying to implement a min-hashing algorithm on a very large dataset.
Basically, I have a very large set of features (100 million to 1 billion) for a given document, and I need to create 1000 to 10000 different random permutations of this set of features.
I do NOT want to build the random permutations explicitly, so the technique I would like to use is the following:
generate a hash function h and consider that, for two indices r and s,
r appears before s in the permutation if h(r) < h(s); do that for 100 to 1000 different hash functions.
Are there any known libraries that I might have missed? Or any standard way of generating families of hash functions with Python that you might be aware of?
I'd just do something like this (if you don't need thread-safety -- not hard to alter if you DO need thread safety -- and assuming a 32-bit Python version):
import random

_memomask = {}

def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash
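Usage then matches what the question asks for (a sketch; the exact values depend on the Python build's hash()):

h1 = hash_function(1)
h2 = hash_function(2)
print(h1("foo"))
print(h2("foo"))  # same input, (almost always) a different output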
As mentioned above, you can use universal hashing for minhash. For example:
import random

def minhash():
    d1 = set(random.randint(0, 2000) for _ in range(1000))
    d2 = set(random.randint(0, 2000) for _ in range(1000))
    jacc_sim = len(d1.intersection(d2)) / len(d1.union(d2))
    print("jaccard similarity: {}".format(jacc_sim))

    N_HASHES = 200
    hash_funcs = []
    for i in range(N_HASHES):
        hash_funcs.append(universal_hashing())
    m1 = [min([h(e) for e in d1]) for h in hash_funcs]
    m2 = [min([h(e) for e in d2]) for h in hash_funcs]
    minhash_sim = sum(int(m1[i] == m2[i]) for i in range(N_HASHES)) / N_HASHES
    print("min-hash similarity: {}".format(minhash_sim))

def universal_hashing():
    def rand_prime():
        while True:
            # start from an odd number; an even start with step 2 would only ever yield even candidates
            p = random.randrange(2 ** 32 + 1, 2 ** 34, 2)
            if all(p % n != 0 for n in range(3, int((p ** 0.5) + 1), 2)):
                return p
    m = 2 ** 32 - 1
    p = rand_prime()
    a = random.randint(0, p)
    if a % 2 == 0:
        a += 1
    b = random.randint(0, p)
    def h(x):
        return ((a * x + b) % p) % m
    return h
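To actually run the demo (my addition), call minhash() after both definitions; the printed min-hash similarity should approximate the exact Jaccard similarity:

if __name__ == '__main__':
    minhash()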
Reference
@alex's answer is great and concise, but the hash functions it generates are not "very different from each other".
Let's look at the Pearson correlation between 10000 samples of 10000 hashes that put the results in 100 bins:
%%time  # 1min 14s
n = 10000
hashes = [hash_function(i) for i in range(n)]
median_pvalue(hashes, n=n)
# 1.1614081043690444e-06
I.e. the median p-value is 1e-06, which is far from random. Here's an example if it were truly random:
%%time  # 4min 15s
hashes = [lambda _: random.randint(0, 100) for _ in range(n)]
median_pvalue(hashes, n=n)
# 0.4979718236429698
Using the Carter and Wegman method you could get:
%%time  # 1min 43s
hashes = HashFamily(100).draw_hashes(n)
median_pvalue(hashes, n=n)
# 0.841929288037321
Code to reproduce:
from scipy.stats.stats import pearsonr
import numpy as np
import random

_memomask = {}

def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash

class HashFamily():
    r"""Universal hash family as proposed by Carter and Wegman.

    .. math::

        h_{a,b}(x) = ((ax + b) \bmod p) \bmod m, \quad p > m

    Args:
        bins (int): Number of bins to hash to. Better if a prime number.
        moduler (int, optional): Temporary hashing. Has to be a prime number.
    """
    def __init__(self, bins, moduler=None):
        if moduler and moduler <= bins:
            raise ValueError("p (moduler) should be >> m (buckets)")
        self.bins = bins
        self.moduler = moduler if moduler else self._next_prime(np.random.randint(self.bins + 1, 2**32))
        # do not allow same a and b, as it could mean shifted hashes
        self.sampled_a = set()
        self.sampled_b = set()

    def _is_prime(self, x):
        """Naive is-prime test."""
        for i in range(2, int(np.sqrt(x)) + 1):  # must include the square root itself
            if x % i == 0:
                return False
        return True

    def _next_prime(self, n):
        """Naively gets the next prime larger than n."""
        while not self._is_prime(n):
            n += 1
        return n

    def draw_hash(self, a=None, b=None):
        """Draws a single hash function from the family."""
        if a is None:
            while a is None or a in self.sampled_a:
                a = np.random.randint(1, self.moduler - 1)
                assert len(self.sampled_a) < self.moduler - 2, "please give a bigger moduler"
            self.sampled_a.add(a)
        if b is None:
            while b is None or b in self.sampled_b:
                b = np.random.randint(0, self.moduler - 1)
                assert len(self.sampled_b) < self.moduler - 1, "please give a bigger moduler"
            self.sampled_b.add(b)
        return lambda x: ((a * x + b) % self.moduler) % self.bins

    def draw_hashes(self, n, **kwargs):
        """Draws n hash functions from the family."""
        return [self.draw_hash() for i in range(n)]

def median_pvalue(hashes, buckets=100, n=1000):
    p_values = []
    for j in range(n-1):
        a = [hashes[j](i) % buckets for i in range(n)]
        b = [hashes[j+1](i) % buckets for i in range(n)]
        p_values.append(pearsonr(a, b)[1])
    return np.median(p_values)
Note that my implementation of Carter and Wegman is very naive (e.g. the generation of prime numbers). It could be made shorter and quicker.
You should consider using universal hashing. My answer and code can be found here: https://stackoverflow.com/a/25104050/207661
A universal hash family is a set H of hash functions of size m such that any two (distinct) inputs collide with probability at most 1/m when the hash function h is drawn randomly from the set H.
Based on the formulation in Wikipedia, you can use the following code:
import random

def is_prime(n):
    if n == 2 or n == 3: return True
    if n % 2 == 0 or n < 2: return False
    for i in range(3, int(n**0.5)+1, 2):
        if n % i == 0:
            return False
    return True

# universal hash functions
class UniversalHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset
        primes = []
        number_to_check = min_value_for_prime_number
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)
        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p))

    def __call__(self, function_index, input_integer):
        a, b, p = self.hash_function_attrs[function_index]
        return (((a*input_integer + b) % p) % self.number_of_buckets) + self.bucket_value_offset
Example usage:
We can create a hash family consisting of 20 hash functions, each one mapping the input to 100 buckets.
hash_family = UniversalHashFamily(20, 100)
And get the hashed values like:
input_integer = 1234567890  # sample input
hash_family(0, input_integer)   # the output of the first hash function, i.e. h0(input_integer)
hash_family(1, input_integer)   # the output of the second hash function, i.e. h1(input_integer)
# ...
hash_family(19, input_integer)  # the output of the last hash function, i.e. h19(input_integer)
If you are interested in the universal hash family for string inputs, you can use the following code. But please note that this code may not be an optimized solution for string hashing.
class UniversalStringHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset
        primes = []
        number_to_check = max(min_value_for_prime_number, number_of_buckets)
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)
        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            a2 = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p, a2))

    def hash_int(self, int_to_hash, a, b, p):
        return (((a*int_to_hash + b) % p) % self.number_of_buckets) + self.bucket_value_offset

    def hash_str(self, str_to_hash, a, b, p, a2):
        str_to_hash = "1" + str_to_hash  # this ensures that universality is not affected; see Wikipedia for more detail
        l = len(str_to_hash) - 1
        int_to_hash = 0
        for i in range(l+1):
            int_to_hash += ord(str_to_hash[i]) * (a2 ** (l-i))
        int_to_hash = int_to_hash % p
        return self.hash_int(int_to_hash, a, b, p)

    def __call__(self, function_index, str_to_hash):
        a, b, p, a2 = self.hash_function_attrs[function_index]
        return self.hash_str(str_to_hash, a, b, p, a2)
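Analogous usage for the string variant (my sketch, not from the original answer):

string_hash_family = UniversalStringHashFamily(20, 100)
print(string_hash_family(0, "hello"))   # h0("hello"), a bucket index in [0, 100)
print(string_hash_family(19, "hello"))  # h19("hello")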
