Optimization of average calculation from a list of dictionaries - python

I have a list of dictionaries, with keys 'a', 'n', 'o', 'u'.
Is there a way to speed up this calculation, for instance with NumPy? There are tens of thousands of items in the list.
The data is drawn from a database, so I must live with that it's in the form of a list of dictionaries originally.
x = n = o = u = 0
loops = 0
for entry in indata:
    x += entry['a'] * entry['n']  # n - number of data points
    n += entry['n']
    o += entry['o']
    u += entry['u']
    loops += 1
average = int(round(x / n)), n, o, u

I doubt this will be much faster, but I suppose it's a candidate for timeit...
from operator import itemgetter
x = n = o = u = 0
items = itemgetter('a', 'n', 'o', 'u')
for entry in indata:
    A, N, O, U = items(entry)
    x += A * N  # n - number of data points
    n += N
    o += O  # don't know what you're doing with O or U, but I'll leave them
    u += U
average = int(round(x / n)), n, o, u
At the very least, it saves a lookup of entry['n'], since I've now saved it to a variable.

You could try something like this:
import numpy as np

mean_a = np.sum(np.array([d['a'] for d in data]) * np.array([d['n'] for d in data])) / len(data)
EDIT: Actually, the method above from @mgilson is faster:
import numpy as np
from operator import itemgetter
from pandas import *
data = []
for i in range(100000):
    data.append({'a': np.random.random(), 'n': np.random.random(), 'o': np.random.random(), 'u': np.random.random()})

def func1(data):
    x = n = o = u = 0
    items = itemgetter('a', 'n', 'o', 'u')
    for entry in data:
        A, N, O, U = items(entry)
        x += A * N  # n - number of data points
        n += N
        o += O  # don't know what you're doing with O or U, but I'll leave them
        u += U
    average = int(round(x / n)), n, o, u
    return average
def func2(data):
    mean_a = np.sum(np.array([d['a'] for d in data]) * np.array([d['n'] for d in data])) / len(data)
    return (mean_a,
            np.sum([d['n'] for d in data]),
            np.sum([d['o'] for d in data]),
            np.sum([d['u'] for d in data])
            )
def func3(data):
    dframe = DataFrame(data)
    return np.sum(dframe["a"] * dframe["n"]) / dframe.shape[0], np.sum(dframe["n"]), np.sum(dframe["o"]), np.sum(dframe["u"])
In [3]: %timeit func1(data)
10 loops, best of 3: 59.6 ms per loop
In [4]: %timeit func2(data)
10 loops, best of 3: 138 ms per loop
In [5]: %timeit func3(data)
10 loops, best of 3: 129 ms per loop
If you are doing other operations on the data, I would definitely look into using the Pandas package. Its DataFrame object is a nice match for the list of dictionaries that you are working with. I think the majority of the overhead is the cost of getting the data into NumPy arrays or DataFrame objects in the first place.
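If all you need is the weighted mean itself, another option (a sketch, not benchmarked here; func4 is just an illustrative name) is to build the four columns in one pass and let np.average handle the weighting:
def func4(data):
    # Transpose the list of dicts into four columns, one per key, in a single pass.
    a, n, o, u = (np.array(col) for col in zip(*map(itemgetter('a', 'n', 'o', 'u'), data)))
    # np.average with weights computes sum(a * n) / sum(n), matching x / n above.
    return int(round(np.average(a, weights=n))), n.sum(), o.sum(), u.sum()
This returns the same tuple shape as func1, so it can be dropped into the %timeit comparison above.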

If all you're looking to do is get an average value of something, why not:
import math

sum_for_average = math.fsum(your_item)
average_of_list = sum_for_average / len(your_item)
No mucking about with NumPy at all.
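Note that the original question computes a weighted average (x / n) rather than a plain mean; a minimal sketch of the weighted version with fsum, assuming indata is the question's list of dictionaries:
import math

weighted_sum = math.fsum(entry['a'] * entry['n'] for entry in indata)
total_n = math.fsum(entry['n'] for entry in indata)
average = int(round(weighted_sum / total_n))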

Related

Find the number in a given range so that the gcd of the number with any element of a given list will always be 1

Given a number M and a list A which contains N elements (A1, A2, ...),
find all the numbers k such that:
1 <= k <= M and gcd(Ai, k) is always equal to 1.
Here's my code. The only problem with it is that it uses nested loops, which slows things down when my inputs are big. How can I fix it so that it requires less time?
N, M = [int(v) for v in input().split()]
A = [int(v) for v in input().split()]
from math import gcd
cnt = 0
print(N)
for k in range(1, M+1):
    for i in range(N):
        if gcd(k, A[i]) == 1:
            cnt += 1
    if cnt == N:
        print(k)
    cnt = 0
inputs example: (first line contains N and M, second contains the list A1, A2,...)
3 12
6 1 5
Here's a fast version that eliminates the nested loops:
N, M = [int(v) for v in input().split()]
A = [int(v) for v in input().split()]
from math import gcd
print(N)
l = 1
for v in A:
    l = l*v//gcd(l, v)
for k in range(1, M+1):
    if gcd(l, k) == 1:
        print(k)
It works by first taking the LCM, l, of the values in A. It then suffices to check if the GCD of k and l is 1, which means there are no common factors with any of the values in A.
Note: If you're using a newer version of Python than I am (3.9 or later), you can import lcm from math and replace l = l*v//gcd(l, v) with l = lcm(l, v).
Or, as Kelly Bundy pointed out, lcm accepts an arbitrary number of arguments, so the first loop can be replaced with l = lcm(*A) if you're using 3.9 or later.
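Putting those two remarks together, a compact sketch of the same answer for Python 3.9+ (logic unchanged, only the LCM computation replaced):
from math import gcd, lcm

N, M = [int(v) for v in input().split()]
A = [int(v) for v in input().split()]
print(N)
l = lcm(*A)  # least common multiple of all values in A
for k in range(1, M + 1):
    if gcd(l, k) == 1:
        print(k)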
Just another approach, using sympy.ntheory's factorint and Python sets. Speed-wise it has no advantage on my machine over the math.lcm() or math.gcd() based solutions when applied to small lists and small numbers, but it excels on a very large randomized list:
M = 12
lstA = (6, 1, 5)
from sympy.ntheory import factorint
lstAfactors = []
for a in lstA:
    lstAfactors += factorint(a)
setA = set(lstAfactors)
for k in range(1, M+1):
    if not (set(factorint(k)) & setA):
        print(k)
The code above implements the idea described in Yatisi's answer and coded by Tom Karzes using math.gcd(), but it uses sympy.ntheory's factorint() and set() instead of math.gcd().
In terms of speed, the factorint() solution seems to be the fastest on the large randomized data set tested below:
# ======================================================================
from time import perf_counter as T
from math import gcd, lcm
from sympy import factorint
from random import choice
#M = 3000
#lstA = 100 * [6, 12, 18, 121, 256, 1024, 361, 2123, 39]
M = 8000
lstA = [ choice(range(1,8000)) for _ in range(8000) ]
# ----------------------------------------------------------------------
from sympy.ntheory import factorint
lstResults = []
lstAfactors = []
sT=T()
for a in lstA:
    lstAfactors += factorint(a)
setA = set(lstAfactors)
for k in range(1, M+1):
    if not (set(factorint(k)) & setA):
        lstResults += [k]
print("factorint:", T()-sT)
#print(lstResults)
print("---")
# ----------------------------------------------------------------------
lstResults = []
sT=T()
#l = 1
#for a in lstA:
#    l = (l*a)//gcd(l, a) # can be replaced by:
l = lcm(*lstA) # least common multiple divisible by all lstA items
# ^-- which runs MAYBE a bit faster than the loop with gcd()
for k in range(1, M+1):
    if gcd(l, k) == 1:
        lstResults += [k]
print("lcm() :", T()-sT)
#print(lstResults)
print("---")
# ----------------------------------------------------------------------
lstResults = []
sT=T()
l = 1
for a in lstA:
    l = (l*a)//gcd(l, a) # can be replaced by:
#l = lcm(*lstA) # least common multiple divisible by all lstA items
# ^-- which runs MAYBE a bit faster than the loop with gcd()
for k in range(1, M+1):
    if gcd(l, k) == 1:
        lstResults += [k]
print("gcd() :", T()-sT)
#print(lstResults)
print("---")
# ----------------------------------------------------------------------
import numpy as np
A = np.array(lstA)
def find_gcd_np(M, A, to_gcd=1):
    vals = np.arange(1, M + 1)
    return vals[np.all(np.gcd(vals, np.array(A)[:, None]) == to_gcd, axis=0)]
sT=T()
lstResults = find_gcd_np(M, A, 1).tolist()
print("numpy :", T()-sT)
#print(lstResults)
print("---")
This prints:
factorint: 0.09754624799825251
---
lcm() : 0.10102138598449528
---
gcd() : 0.10236155497841537
---
numpy : 6.923375226906501
---
The timing results change dramatically for the second data variant in the code above (the commented-out M = 3000 case), printing:
factorint: 0.021642255946062505
---
lcm() : 0.0010238440008834004
---
gcd() : 0.0013772319070994854
---
numpy : 0.19953695288859308
---
where the factorint-based approach is about 20x slower and the numpy-based approach about 200x slower than the gcd/lcm-based ones.
Run the timing test yourself online. It won't run the large-data case, but it can at least demonstrate that the numpy approach is about 100x slower than the gcd one:
factorint: 0.03271647123619914
---
lcm() : 0.003286922350525856
---
gcd() : 0.0029655308462679386
---
numpy : 0.41759901121258736
1 https://ato.pxeger.com/run?1=3VXBitswED0W9BXDGoq16-zaSbtsAzmk0GN6KLmUkAbFkdcismQkZbc-9Et62Uv7Uf2ajiwnNt1Du7ClUIORpXmaefNmLH39Xjeu1Orh4dvBFaObHy--RDB7locURlfgRMUBQFS1Ng5qbopNrg_KcQPMwjKAKubKHnSb7xKQeRVstqnq5mQrWO60EcoFo2Fqh0NnzEstck6iBfqCGUzSNCWRtG6OkyxN4RxW1wlkY3xv_JglMH7tV9LxqwQm136ejSf4-WZNhk46H6suQoxhb3mcJTdpSikU2sAGhIKw7BdhTUgEo2d5BjJcKldybZrHaiDDD9wepLOe9GrdUg5mGxbscraMKfFkmSfrAVOCOQ6RF7PeZ8wosbxNHId4AAte9n3KKNziIqOtOxAFKO0g9pt6Z3sU6qV3NO9gEEIfWWPk1X5N6hZ8dto3PUsAaY_skpIoGPtN9AhHlc7oMyo-4DXULpK-kUj0i4ZRmwqaYnnO6NUV9m8sE2AUIsiZgi0Hw2vJcr6DbTMF4rHY3_G53-9RkjOL7aurSiuoMK6oJYeduBNWbPFr2wCTsg0HwvHKYsxPoxHclyIvwRyUhcX849t3yGorfFtY_3-5EoNjw4DUuoZ7gf-Yp_bb6nX89xQPAsj-oFo-F-oRT6nW3y5WqNXjdn9SpaL_rVSt139Wqu7YUgd_pOPxr2rijxdVXzJjWNOeMZTseAGFULsNkt2oOl4kME_AaT-fHZO_Y9Iet56kgQvIaGs23B2MalErj5EyxsFn75eSPuScrqYJvNeKr1sRQxjsic_CzlLat9Owyx6zy-il01JYF5-0C1k-UepwC3eX8fFS_gk)
This is probably more a math question than a programming question, however, here comes my take: Depending on M and A, it might be better to
Find the prime divisors of the Ai (have a look at this) and put them in a set.
Either remove (sieve) all multiples of these primes from list(range(1,M+1)), which you can do (more) efficiently by smart ordering, or find all primes smaller or equal to M (which could even be pre-computed) that are not divisors of any Ai and compute all multiples up to M.
Explanation: gcd(Ai, k) = 1 if and only if Ai and k have no common divisor greater than 1, which is the same as having no common prime divisor. Thus, we can first find all prime divisors of the Ai and then make sure our k has none of them as a divisor.
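A rough sketch of that sieve idea (the function name and the use of sympy.primefactors are my choices, not part of the original answer):
from sympy import primefactors

def coprime_to_all(A, M):
    # Collect every prime that divides any element of A.
    banned = set()
    for a in A:
        banned.update(primefactors(a))
    # Sieve out all multiples of those primes up to M.
    keep = [True] * (M + 1)
    for p in banned:
        for multiple in range(p, M + 1, p):
            keep[multiple] = False
    return [k for k in range(1, M + 1) if keep[k]]

print(coprime_to_all([6, 1, 5], 12))  # -> [1, 7, 11]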
Using numpy with vectorised operations will be a good alternative when your input range M goes up into the hundreds and beyond while A stays small (about the size of your current A):
import numpy as np
def find_gcd_np(M, A, to_gcd=1):
    vals = np.arange(1, M + 1)
    return vals[np.all(np.gcd(vals, np.array(A)[:, None]) == to_gcd, axis=0)]
Usage:
print(find_gcd_np(100, [6, 1, 5], 1))

what is the most efficient way of concat two numbers to one number in python?

The numbers are always between 0 and 255. I have tested a few ways of concatenating them as strings and casting back to int, but they are very costly time-wise for my code.
example
a = 152
c = 255
d = concat(a,c)
Expected result:
d = 152255
If the numbers are bounded, just multiply and add:
>>> a = 152
>>> c = 255
>>> d = a*1000+c
>>> d
152255
>>>
This is pretty fast:
def concat(a, b):
    return 10**int(log(b, 10)+1)*a+b
It uses the logarithm to find the power of ten by which the first number must be multiplied for the sum to work as a concatenation:
In [1]: from math import log
In [2]: a = 152
In [3]: b = 255
In [4]: def concat(a, b):
...:     return 10**int(log(b, 10)+1)*a+b
...:
In [5]: concat(a, b)
Out[5]: 152255
In [6]: %timeit concat(a, b)
1000000 loops, best of 3: 1.18 us per loop
Yeah, there you go:
a = 152
b = 255
def concat(a, b):
    n = next(x for x in range(10) if 10**x > b)  # number of decimal digits in b (works up to 10**10)
    return a * 10**n + b
print(concat(a, b)) # -> 152255
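Yet another way to pick the power of ten, using the decimal length of b (a small sketch, not benchmarked against the versions above):
def concat(a, b):
    # Shift a left by exactly as many decimal digits as b has.
    return a * 10 ** len(str(b)) + b

print(concat(152, 255))  # -> 152255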

Delete columns of matrix of CSR format in Python

I have a sparse matrix (22000x97482) in CSR format and I want to delete some columns (the indices of the columns to remove are stored in a list).
If you have a very large number of columns then generating the full set of column indices can become rather costly. One slightly faster alternative would be to temporarily convert to COO format:
import numpy as np
from scipy import sparse
def dropcols_fancy(M, idx_to_drop):
    idx_to_drop = np.unique(idx_to_drop)
    keep = ~np.in1d(np.arange(M.shape[1]), idx_to_drop, assume_unique=True)
    return M[:, np.where(keep)[0]]

def dropcols_coo(M, idx_to_drop):
    idx_to_drop = np.unique(idx_to_drop)
    C = M.tocoo()
    keep = ~np.in1d(C.col, idx_to_drop)
    C.data, C.row, C.col = C.data[keep], C.row[keep], C.col[keep]
    C.col -= idx_to_drop.searchsorted(C.col)  # decrement column indices
    C._shape = (C.shape[0], C.shape[1] - len(idx_to_drop))
    return C.tocsr()
Check equivalence:
m, n, d = 1000, 2000, 20
M = sparse.rand(m, n, format='csr')
idx_to_drop = np.random.randint(0, n, d)
M_drop1 = dropcols_fancy(M, idx_to_drop)
M_drop2 = dropcols_coo(M, idx_to_drop)
print(np.all(M_drop1.A == M_drop2.A))
# True
Benchmark:
In [1]: m, n = 1000, 1000000
In [2]: %%timeit M = sparse.rand(m, n, format='csr')
...: dropcols_fancy(M, idx_to_drop)
...:
1 loops, best of 3: 1.11 s per loop
In [3]: %%timeit M = sparse.rand(m, n, format='csr')
...: dropcols_coo(M, idx_to_drop)
...:
1 loops, best of 3: 365 ms per loop
You can use fancy indexing to obtain a new csr_matrix with the columns that you have in your list:
all_cols = np.arange(old_m.shape[1])
cols_to_keep = np.where(np.logical_not(np.in1d(all_cols, cols_to_delete)))[0]
m = old_m[:, cols_to_keep]
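A self-contained version of the same idea, with placeholder data (the matrix size and the columns to delete are made up for illustration):
import numpy as np
from scipy import sparse

old_m = sparse.rand(1000, 2000, format='csr')
cols_to_delete = [3, 7, 42]

all_cols = np.arange(old_m.shape[1])
cols_to_keep = np.where(np.logical_not(np.in1d(all_cols, cols_to_delete)))[0]
m = old_m[:, cols_to_keep]
print(m.shape)  # (1000, 1997)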

What is the quickest way to get a number with unique digits in python?

Lemme clarify:
What would be the fastest way to get every number with all unique digits between two numbers. For example, 10,000 and 100,000.
Some obvious ones would be 12,345 or 23,456. I'm trying to find a way to gather all of them.
for i in xrange(LOW, HIGH):
    str_i = str(i)
    ...?
Use itertools.permutations:
from itertools import permutations
result = [
    a * 10000 + b * 1000 + c * 100 + d * 10 + e
    for a, b, c, d, e in permutations(range(10), 5)
    if a != 0
]
I used the facts that:
numbers between 10000 and 100000 have either 5 or 6 digits, and the only 6-digit number in that range (100000) does not have unique digits,
itertools.permutations creates all orderings of a given length (so both 12345 and 54321 will appear in the result),
you can take permutations directly of a sequence of integers (so there is no overhead from converting types).
EDIT:
Thanks for accepting my answer, but here is the data for the others, comparing the mentioned solutions:
>>> from timeit import timeit
>>> stmt1 = '''
a = []
for i in xrange(10000, 100000):
    s = str(i)
    if len(set(s)) == len(s):
        a.append(s)
'''
>>> stmt2 = '''
result = [
    int(''.join(digits))
    for digits in permutations('0123456789', 5)
    if digits[0] != '0'
]
'''
>>> setup2 = 'from itertools import permutations'
>>> stmt3 = '''
result = [
    x for x in xrange(10000, 100000)
    if len(set(str(x))) == len(str(x))
]
'''
>>> stmt4 = '''
result = [
    a * 10000 + b * 1000 + c * 100 + d * 10 + e
    for a, b, c, d, e in permutations(range(10), 5)
    if a != 0
]
'''
>>> setup4 = setup2
>>> timeit(stmt1, number=100)
7.955858945846558
>>> timeit(stmt2, setup2, number=100)
1.879319190979004
>>> timeit(stmt3, number=100)
8.599710941314697
>>> timeit(stmt4, setup4, number=100)
0.7493319511413574
So, to sum up:
solution no. 1 took 7.96 s,
solution no. 2 (my original solution) took 1.88 s,
solution no. 3 took 8.6 s,
solution no. 4 (my updated solution) took 0.75 s,
The last solution looks around 10x faster than the solutions proposed by others.
Note: My solution has some imports that I did not measure. I assumed your imports will happen once, and code will be executed multiple times. If it is not the case, please adapt the tests to your needs.
EDIT #2: I have added another solution, as operating on strings is not even necessary - the same result can be achieved with permutations of actual integers. I bet this can be sped up even more.
Cheap way to do this:
for i in xrange(LOW, HIGH):
    s = str(i)
    if len(set(s)) == len(s):
        # number has unique digits
This uses a set to collect the unique digits, then checks to see that there are as many unique digits as digits in total.
List comprehension will work a treat here (logic stolen from nneonneo):
[x for x in xrange(LOW,HIGH) if len(set(str(x)))==len(str(x))]
And a timeit for those who are curious:
> python -m timeit '[x for x in xrange(10000,100000) if len(set(str(x)))==len(str(x))]'
10 loops, best of 3: 101 msec per loop
Here is an answer from scratch:
def permute(L, max_len):
    allowed = L[:]
    results, seq = [], range(max_len)
    def helper(d):
        if d == 0:
            results.append(''.join(seq))
        else:
            for i in xrange(len(L)):
                if allowed[i]:
                    allowed[i] = False
                    seq[d-1] = L[i]
                    helper(d-1)
                    allowed[i] = True
    helper(max_len)
    return results

A = permute(list("1234567890"), 5)
print A
print len(A)
print all(map(lambda a: len(set(a)) == len(a), A))
It perhaps could be further optimized by using an interval representation of the allowed elements, although for n=10, I'm not sure it will make a difference. I could also transform the recursion into a loop, but in this form it is more elegant and clear.
Edit: Here are the timings of the various solutions
2.75808000565 (My solution)
8.22729802132 (Sol 1)
1.97218298912 (Sol 2)
9.659760952 (Sol 3)
0.841020822525 (Sol 4)
no_list = ['115432', '555555', '1234567', '5467899', '3456789', '987654', '444444']
rep_list = []
nonrep_list = []
for no in no_list:
    u = []
    for digit in no:
        # print(digit)
        if digit not in u:
            u.append(digit)
    # print(u)
    # if a repeated digit is there
    if len(no) != len(u):
        # print(no)
        rep_list.append(no)
    # if no repetition is there
    else:
        nonrep_list.append(no)
print('Numbers which have repetition are =', rep_list)
print('Numbers which have no repetition are =', nonrep_list)

hash functions family generator in python

I am looking for a hash functions family generator that could generate a family of hash functions given a set of parameters. I haven't found any such generator so far.
Is there a way to do that with the hashlib package ?
For example I'd like to do something like :
h1 = hash_function(1)
h2 = hash_function(2)
...
and h1 and h2 would be different hash functions.
For those of you who might know about it, I am trying to implement a min-hashing algorithm on a very large dataset.
Basically, I have a very large set of features (100 millions to 1 billion) for a given document, and I need to create 1000 to 10000 different random permutations for this set of features.
I do NOT want to build the random permutations explicitly, so the technique I would like to use is the following:
generate a hash function h and consider that, for two indices r and s,
r appears before s in the permutation if h(r) < h(s); do that for 100 to 1000 different hash functions.
Are there any known libraries that I might have missed ? Or any standard way of generating families of hash functions with python that you might be aware of ?
I'd just do something like (if you don't need thread-safety -- not hard to alter if you DO need thread safety -- and assuming a 32-bit Python version):
import random
_memomask = {}
def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash
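To tie this back to the min-hashing use case in the question, one might use such functions like this (a sketch; the number of permutations and the feature set are made up):
num_perms = 100
hash_funcs = [hash_function(i) for i in range(num_perms)]

features = {12, 41, 907, 100003}  # indices of the features present in one document
signature = [min(h(f) for f in features) for h in hash_funcs]
# The estimated Jaccard similarity of two documents is the fraction of
# signature positions on which their signatures agree.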
As mentioned above, you can use universal hashing for minhash.
For example:
import random
def minhash():
    d1 = set(random.randint(0, 2000) for _ in range(1000))
    d2 = set(random.randint(0, 2000) for _ in range(1000))
    jacc_sim = len(d1.intersection(d2)) / len(d1.union(d2))
    print("jaccard similarity: {}".format(jacc_sim))

    N_HASHES = 200
    hash_funcs = []
    for i in range(N_HASHES):
        hash_funcs.append(universal_hashing())

    m1 = [min([h(e) for e in d1]) for h in hash_funcs]
    m2 = [min([h(e) for e in d2]) for h in hash_funcs]
    minhash_sim = sum(int(m1[i] == m2[i]) for i in range(N_HASHES)) / N_HASHES
    print("min-hash similarity: {}".format(minhash_sim))

def universal_hashing():
    def rand_prime():
        while True:
            # start from an odd number so the odd-divisor trial division below is valid
            p = random.randrange(2 ** 32 + 1, 2 ** 34, 2)
            if all(p % n != 0 for n in range(3, int((p ** 0.5) + 1), 2)):
                return p
    m = 2 ** 32 - 1
    p = rand_prime()
    a = random.randint(0, p)
    if a % 2 == 0:
        a += 1
    b = random.randint(0, p)
    def h(x):
        return ((a * x + b) % p) % m
    return h
Reference
@alex's answer is great and concise, but the hash functions it generates are not "very different from each other".
Let's look at the Pearson correlation between 10000 samples of 10000 hashes that put the results in 100 bins:
%%time # 1min 14s
n=10000
hashes = [hash_function(i) for i in range(n)]
median_pvalue(hashes, n=n)
# 1.1614081043690444e-06
I.e. the median p-value is 1e-06, which is far from random. Here's an example if it were truly random:
%%time # 4min 15s
hashes = [lambda _ : random.randint(0,100) for _ in range(n)]
median_pvalue(hashes, n=n)
# 0.4979718236429698
Using Carter and Wegman method you could get:
%%time # 1min 43s
hashes = HashFamily(100).draw_hashes(n)
median_pvalue(hashes, n=n)
# 0.841929288037321
Code to reproduce:
from scipy.stats.stats import pearsonr
import numpy as np
import random

_memomask = {}

def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash

class HashFamily():
    r"""Universal hash family as proposed by Carter and Wegman.

    .. math::

        \begin{array}{ll}
        h_{{a,b}}(x)=((ax+b)~{\bmod ~}p)~{\bmod ~}m \ \mid p > m\\
        \end{array}

    Args:
        bins (int): Number of bins to hash to. Better if a prime number.
        moduler (int, optional): Temporary hashing. Has to be a prime number.
    """
    def __init__(self, bins, moduler=None):
        if moduler and moduler <= bins:
            raise ValueError("p (moduler) should be >> m (buckets)")
        self.bins = bins
        self.moduler = moduler if moduler else self._next_prime(np.random.randint(self.bins + 1, 2**32))
        # do not allow same a and b, as it could mean shifted hashes
        self.sampled_a = set()
        self.sampled_b = set()

    def _is_prime(self, x):
        """Naive is-prime test."""
        for i in range(2, int(np.sqrt(x)) + 1):
            if x % i == 0:
                return False
        return True

    def _next_prime(self, n):
        """Naively gets the next prime larger than n."""
        while not self._is_prime(n):
            n += 1
        return n

    def draw_hash(self, a=None, b=None):
        """Draws a single hash function from the family."""
        if a is None:
            while a is None or a in self.sampled_a:
                a = np.random.randint(1, self.moduler - 1)
                assert len(self.sampled_a) < self.moduler - 2, "please give a bigger moduler"
            self.sampled_a.add(a)
        if b is None:
            while b is None or b in self.sampled_b:
                b = np.random.randint(0, self.moduler - 1)
                assert len(self.sampled_b) < self.moduler - 1, "please give a bigger moduler"
            self.sampled_b.add(b)
        return lambda x: ((a * x + b) % self.moduler) % self.bins

    def draw_hashes(self, n, **kwargs):
        """Draws n hash functions from the family."""
        return [self.draw_hash() for i in range(n)]

def median_pvalue(hashes, buckets=100, n=1000):
    p_values = []
    for j in range(n - 1):
        a = [hashes[j](i) % buckets for i in range(n)]
        b = [hashes[j + 1](i) % buckets for i in range(n)]
        p_values.append(pearsonr(a, b)[1])
    return np.median(p_values)
Note that my implementation of Carter and Wegman is very naive (e.g. the generation of prime numbers). It could be made shorter and quicker.
You should consider using universal hashing. My answer and code can be found here: https://stackoverflow.com/a/25104050/207661
The universal hash family is a set of hash functions H of size m, such that any two (distinct) inputs collide with probability at most 1/m when the hash function h is drawn randomly from the set H.
Based on the formulation in Wikipedia, you can use the following code:
import random
def is_prime(n):
    if n == 2 or n == 3: return True
    if n % 2 == 0 or n < 2: return False
    for i in range(3, int(n**0.5)+1, 2):
        if n % i == 0:
            return False
    return True

# universal hash functions
class UniversalHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset

        primes = []
        number_to_check = min_value_for_prime_number
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)

        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p))

    def __call__(self, function_index, input_integer):
        a, b, p = self.hash_function_attrs[function_index]
        return (((a*input_integer + b) % p) % self.number_of_buckets) + self.bucket_value_offset
Example usage:
We can create a hash family consisting of 20 hash functions, each one mapping the input to 100 buckets.
hash_family = UniversalHashFamily(20, 100)
And get the hashed values like:
input_integer = 1234567890 # sample input
hash_family(0, input_integer) # the output of the first hash function, i.e. h0(input_integer)
hash_family(1, input_integer) # the output of the second hash function, i.e. h1(input_integer)
# ...
hash_family(19, input_integer) # the output of the last hash function, i.e. h19(input_integer)
If you are interested in a universal hash family for string inputs, you can use the following code. But please note that this code may not be an optimized solution for string hashing.
class UniversalStringHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset

        primes = []
        number_to_check = max(min_value_for_prime_number, number_of_buckets)
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)

        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            a2 = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p, a2))

    def hash_int(self, int_to_hash, a, b, p):
        return (((a*int_to_hash + b) % p) % self.number_of_buckets) + self.bucket_value_offset

    def hash_str(self, str_to_hash, a, b, p, a2):
        str_to_hash = "1" + str_to_hash  # this will ensure that universality is not affected, see wikipedia for more detail
        l = len(str_to_hash) - 1
        int_to_hash = 0
        for i in range(l + 1):
            int_to_hash += ord(str_to_hash[i]) * (a2 ** (l - i))
        int_to_hash = int_to_hash % p
        return self.hash_int(int_to_hash, a, b, p)

    def __call__(self, function_index, str_to_hash):
        a, b, p, a2 = self.hash_function_attrs[function_index]
        return self.hash_str(str_to_hash, a, b, p, a2)
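Usage mirrors the integer family above; for example (the input string is arbitrary):
string_hash_family = UniversalStringHashFamily(20, 100)
print(string_hash_family(0, "hello"))   # output of the first hash function, i.e. h0("hello")
print(string_hash_family(19, "hello"))  # output of the last hash function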
