Getting Test Case runtime error - python

I'm working on this question (not homework):
Given a list of unsorted integers, A = [a1, a2, ..., an], can you find the pair of elements that have the smallest absolute difference between them? If there are multiple pairs, find them all.
This is what I came up with:
num = int(input())
array = [int(x) for x in input().split()]
diff = []
for i in range(len(array)):
    for j in array:
        diff.append(array[i] - j)
total = []
for i in diff:
    if i > 0:
        total.append(i)
grail = min(total)
holy = []
for i in range(len(array)):
    for j in array:
        if (array[i] - j) == grail:
            holy.append(array[i])
            holy.append(j)
final = sorted(holy)
for item in final:
    print(item, end=' ')
This runs for a few cases but gets a runtime error on large inputs. Any suggestions I might try?
Ex:
Input = [-20 -3916237 -357920 -3620601 7374819 -7330761 30 6246457 -6461594 266854 -520 -470 ]
Output = -520 -470 -20 30
Explanation = (-470) - (-520) = 30 - (-20) = 50, which is the smallest difference.
Thanks in advance

I did not bother to check your code for correctness, because the implementation has a complexity of O(n^2):
for i in range(len(array)):
    for j in array:
        if (array[i] - j) == grail:
            holy.append(array[i])
            holy.append(j)
The answer should preferably run in O(n log n). To achieve that, you need to sort the list upfront, after which only adjacent elements need to be compared.
from unittest import TestCase
import unittest
from sys import maxsize
from itertools import tee

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

def solution(n):
    n = sorted(n)
    pairs = []
    diff = maxsize
    for l, u in pairwise(n):
        if u - l <= diff:
            diff = u - l
            pairs.append((diff, (l, u)))
    pairs = sorted(pairs)
    least = pairs[0][0]
    return list(map(lambda x: x[1], filter(lambda x: x[0] == least, pairs)))

class TestLeastDifference(TestCase):
    def testSimple(self):
        n = [-20, -3916237, -357920, -3620601, 7374819, -7330761, 30, 6246457, -6461594, 266854, -520, -470]
        self.assertEqual(solution(n), [(-520, -470), (-20, 30)])

if __name__ == '__main__':
    unittest.main()

Related

Find the number in a given range so that the gcd of the number with any element of a given list will always be 1

Given a number M and a list A which contains N elements (A1, A2, ...),
find all the numbers k such that:
1 ≤ k ≤ M and gcd(Ai, k) = 1 for every Ai.
Here's my code. The only problem is that it uses nested loops, which slows things down when my inputs are big. How can I fix it so that it requires less time?
N, M = [int(v) for v in input().split()]
A = [int(v) for v in input().split()]
from math import gcd
cnt = 0
print(N)
for k in range(1, M+1):
    for i in range(N):
        if gcd(k, A[i]) == 1:
            cnt += 1
    if cnt == N:
        print(k)
    cnt = 0
inputs example: (first line contains N and M, second contains the list A1, A2,...)
3 12
6 1 5
Here's a fast version that eliminates the nested loops:
N, M = [int(v) for v in input().split()]
A = [int(v) for v in input().split()]
from math import gcd
print(N)
l = 1
for v in A:
    l = l*v//gcd(l, v)
for k in range(1, M+1):
    if gcd(l, k) == 1:
        print(k)
It works by first taking the LCM, l, of the values in A. It then suffices to check whether the GCD of k and l is 1, which means k has no common factors with any of the values in A.
Note: If you're using a newer version of Python than I am (3.9 or later), you can import lcm from math and replace l = l*v//gcd(l, v) with l = lcm(l, v).
Or, as Kelly Bundy pointed out, lcm accepts an arbitrary number of arguments, so the first loop can be replaced with l = lcm(*A) if you're using 3.9 or later.
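For instance, with the example input from the question (N = 3, M = 12, A = [6, 1, 5]), the whole approach fits in a few lines on Python 3.9+ (a small sketch, hard-coding the input instead of reading it):
from math import gcd, lcm

M = 12
A = [6, 1, 5]
l = lcm(*A)  # l = 30; any k sharing a factor with some Ai shares one with l
print([k for k in range(1, M + 1) if gcd(l, k) == 1])  # -> [1, 7, 11]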
Just another approach, using sympy.ntheory's factorint and Python sets. Speed-wise it has, on my machine, no advantage over the math.lcm() or math.gcd() based solutions when applied to small lists and numbers, but it excels on very large randomized lists:
M = 12
lstA = (6, 1, 5)
from sympy.ntheory import factorint
lstAfactors = []
for a in lstA:
    lstAfactors += factorint(a)
setA = set(lstAfactors)
for k in range(1, M+1):
    if not (set(factorint(k)) & setA):
        print(k)
The code above implements the idea described in Yatisi's answer and coded in Tom Karzes' math.gcd() solution, but uses sympy.ntheory's factorint() and set() instead of math.gcd().
In terms of speed, the factorint() solution seems to be the fastest on the data tested below:
# ======================================================================
from time import perf_counter as T
from math import gcd, lcm
from sympy import factorint
from random import choice

#M = 3000
#lstA = 100 * [6, 12, 18, 121, 256, 1024, 361, 2123, 39]
M = 8000
lstA = [choice(range(1, 8000)) for _ in range(8000)]
# ----------------------------------------------------------------------
from sympy.ntheory import factorint
lstResults = []
lstAfactors = []
sT = T()
for a in lstA:
    lstAfactors += factorint(a)
setA = set(lstAfactors)
for k in range(1, M+1):
    if not (set(factorint(k)) & setA):
        lstResults += [k]
print("factorint:", T()-sT)
#print(lstResults)
print("---")
# ----------------------------------------------------------------------
lstResults = []
sT = T()
#l = 1
#for a in lstA:
#    l = (l*a)//gcd(l, a)  # can be replaced by:
l = lcm(*lstA)  # least common multiple divisible by all lstA items
# ^-- which runs MAYBE a bit faster than the loop with gcd()
for k in range(1, M+1):
    if gcd(l, k) == 1:
        lstResults += [k]
print("lcm() :", T()-sT)
#print(lstResults)
print("---")
# ----------------------------------------------------------------------
lstResults = []
sT = T()
l = 1
for a in lstA:
    l = (l*a)//gcd(l, a)  # can be replaced by:
#l = lcm(*lstA)  # least common multiple divisible by all lstA items
# ^-- which runs MAYBE a bit faster than the loop with gcd()
for k in range(1, M+1):
    if gcd(l, k) == 1:
        lstResults += [k]
print("gcd() :", T()-sT)
#print(lstResults)
print("---")
# ----------------------------------------------------------------------
import numpy as np
A = np.array(lstA)

def find_gcd_np(M, A, to_gcd=1):
    vals = np.arange(1, M + 1)
    return vals[np.all(np.gcd(vals, np.array(A)[:, None]) == to_gcd, axis=0)]

sT = T()
lstResults = find_gcd_np(M, A, 1).tolist()
print("numpy :", T()-sT)
#print(lstResults)
print("---")
printing:
factorint: 0.09754624799825251
---
lcm() : 0.10102138598449528
---
gcd() : 0.10236155497841537
---
numpy : 6.923375226906501
---
The timing results change dramatically for the second data variant (the commented-out M = 3000 case) in the code provided above, printing:
factorint: 0.021642255946062505
---
lcm() : 0.0010238440008834004
---
gcd() : 0.0013772319070994854
---
numpy : 0.19953695288859308
---
where the factorint-based approach is 20x and the numpy-based approach 200x slower than the gcd/lcm-based one.
Run the timing test yourself online. It won't run the large-data case, but it can at least demonstrate that the numpy approach is 100x slower than the gcd one:
factorint: 0.03271647123619914
---
lcm() : 0.003286922350525856
---
gcd() : 0.0029655308462679386
---
numpy : 0.41759901121258736
1 https://ato.pxeger.com/run?1=3VXBitswED0W9BXDGoq16-zaSbtsAzmk0GN6KLmUkAbFkdcismQkZbc-9Et62Uv7Uf2ajiwnNt1Du7ClUIORpXmaefNmLH39Xjeu1Orh4dvBFaObHy--RDB7locURlfgRMUBQFS1Ng5qbopNrg_KcQPMwjKAKubKHnSb7xKQeRVstqnq5mQrWO60EcoFo2Fqh0NnzEstck6iBfqCGUzSNCWRtG6OkyxN4RxW1wlkY3xv_JglMH7tV9LxqwQm136ejSf4-WZNhk46H6suQoxhb3mcJTdpSikU2sAGhIKw7BdhTUgEo2d5BjJcKldybZrHaiDDD9wepLOe9GrdUg5mGxbscraMKfFkmSfrAVOCOQ6RF7PeZ8wosbxNHId4AAte9n3KKNziIqOtOxAFKO0g9pt6Z3sU6qV3NO9gEEIfWWPk1X5N6hZ8dto3PUsAaY_skpIoGPtN9AhHlc7oMyo-4DXULpK-kUj0i4ZRmwqaYnnO6NUV9m8sE2AUIsiZgi0Hw2vJcr6DbTMF4rHY3_G53-9RkjOL7aurSiuoMK6oJYeduBNWbPFr2wCTsg0HwvHKYsxPoxHclyIvwRyUhcX849t3yGorfFtY_3-5EoNjw4DUuoZ7gf-Yp_bb6nX89xQPAsj-oFo-F-oRT6nW3y5WqNXjdn9SpaL_rVSt139Wqu7YUgd_pOPxr2rijxdVXzJjWNOeMZTseAGFULsNkt2oOl4kME_AaT-fHZO_Y9Iet56kgQvIaGs23B2MalErj5EyxsFn75eSPuScrqYJvNeKr1sRQxjsic_CzlLat9Owyx6zy-il01JYF5-0C1k-UepwC3eX8fFS_gk)
This is probably more a math question than a programming question; however, here comes my take. Depending on M and A, it might be better to:
Find the prime divisors of the Ai (have a look at this) and put them in a set.
Either remove (sieve) all multiples of these primes from list(range(1, M+1)), which you can do (more) efficiently by smart ordering, or find all primes smaller than or equal to M (which could even be pre-computed) that are not divisors of any Ai and compute all their multiples up to M.
Explanation: gcd(Ai, k) = 1 if and only if Ai and k have no common divisors greater than 1, which holds exactly when they share no prime divisors. Thus, we can first find all prime divisors of the Ai and then make sure our k don't have any of them as divisors, too.
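A minimal sketch of that idea (names are mine; trial division stands in for a more serious factorization):
def coprime_ks(A, M):
    # collect all prime divisors of the Ai by trial division
    primes = set()
    for a in A:
        d = 2
        while d * d <= a:
            if a % d == 0:
                primes.add(d)
                while a % d == 0:
                    a //= d
            d += 1
        if a > 1:
            primes.add(a)
    # sieve out every multiple of those primes from 1..M
    alive = [True] * (M + 1)
    for p in primes:
        for m in range(p, M + 1, p):
            alive[m] = False
    return [k for k in range(1, M + 1) if alive[k]]

print(coprime_ks([6, 1, 5], 12))  # -> [1, 7, 11]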
Using numpy with vectorised operations will be a good alternative when your input range M goes up into the hundreds and beyond while A stays small (about the size of your current A):
import numpy as np

def find_gcd_np(M, A, to_gcd=1):
    vals = np.arange(1, M + 1)
    return vals[np.all(np.gcd(vals, np.array(A)[:, None]) == to_gcd, axis=0)]
Usage:
print(find_gcd_np(100, [6, 1, 5], 1))

Is there a pythonic way to sample N consecutive elements from a list or numpy array

Is there a pythonic way to select N consecutive elements from a list or numpy array?
So suppose:
Choice = [1,2,3,4,5,6]
I would like to create a new list of length N by randomly selecting an index X in Choice along with the N-1 consecutive elements following it.
So if:
X = 4
N = 4
The resulting list would be:
Selection = [5,6,1,2]
I think something similar to the following would work:
S = []
for i in range(X, X+N):
    S.append(Choice[i % 6])
But I was wondering if there is a python or numpy function that can select the elements at once and is more efficient.
Use itertools, specifically islice and cycle.
import random
from itertools import cycle, islice

start = random.randint(0, len(Choice) - 1)
list(islice(cycle(Choice), start, start + N))
cycle(Choice) is an infinite sequence that repeats your original list, so that the slice start:start + N will wrap around if necessary.
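For the question's numbers (pinning start to 4 instead of drawing it randomly):
from itertools import cycle, islice

Choice = [1, 2, 3, 4, 5, 6]
start, N = 4, 4
print(list(islice(cycle(Choice), start, start + N)))  # -> [5, 6, 1, 2]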
You could use a list comprehension, using modulo operations on the index to keep it in range of the list:
Choice = [1,2,3,4,5,6]
X = 4
N = 4
L = len(Choice)
Selection = [Choice[i % L] for i in range(X, X+N)]
print(Selection)
Output
[5, 6, 1, 2]
Note that if N is less than or equal to len(Choice), you can greatly simplify the code:
Choice = [1,2,3,4,5,6]
X = 4
N = 4
L = len(Choice)
Selection = Choice[X:X+N] if X+N <= L else Choice[X:] + Choice[:X+N-L]
print(Selection)
Since you are asking for the most efficient way, I created a little benchmark to test the solutions proposed in this thread.
I rewrote your current solution as:
def op(choice, x):
    n = len(choice)
    selection = []
    for i in range(x, x + n):
        selection.append(choice[i % n])
    return selection
Where choice is the input list and x is the random index.
These are the results if choice contains 1_000_000 random numbers:
chepner: 0.10840400000000017 s
nick: 0.2066781999999998 s
op: 0.25887470000000024 s
fountainhead: 0.3679908000000003 s
Full code
import random
from itertools import cycle, islice
from time import perf_counter as pc
import numpy as np

def op(choice, x):
    n = len(choice)
    selection = []
    for i in range(x, x + n):
        selection.append(choice[i % n])
    return selection

def nick(choice, x):
    n = len(choice)
    return [choice[i % n] for i in range(x, x + n)]

def fountainhead(choice, x):
    n = len(choice)
    return np.take(choice, range(x, x + n), mode='wrap')

def chepner(choice, x):
    n = len(choice)
    return list(islice(cycle(choice), x, x + n))

results = []
n = 1_000_000
choice = random.sample(range(n), n)
x = random.randint(0, n - 1)

# Correctness
assert op(choice, x) == nick(choice, x) == chepner(choice, x) == list(fountainhead(choice, x))

# Benchmark
for f in op, nick, chepner, fountainhead:
    t0 = pc()
    f(choice, x)
    t1 = pc()
    results.append((t1 - t0, f))

for t, f in sorted(results):
    print(f'{f.__name__}: {t} s')
If using a numpy array as the source, we could of course use numpy "fancy indexing".
So, if ChoiceArray is the numpy array equivalent of the list Choice, and if L is len(Choice) or len(ChoiceArray):
Selection = ChoiceArray[np.arange(X, N+X) % L]
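A quick check with the question's numbers (assuming the names above):
import numpy as np

ChoiceArray = np.array([1, 2, 3, 4, 5, 6])
X, N, L = 4, 4, 6
print(ChoiceArray[np.arange(X, N + X) % L])  # -> [5 6 1 2]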
Here's a numpy approach:
import numpy as np
Selection = np.take(Choice, range(X,N+X), mode='wrap')
Works even if Choice is a Python list rather than a numpy array.
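For example, using the question's list directly:
import numpy as np

Choice = [1, 2, 3, 4, 5, 6]
X, N = 4, 4
print(np.take(Choice, range(X, N + X), mode='wrap'))  # -> [5 6 1 2]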

Iterative merge sort?

I am aware of the classical recursive approach to sorting by merging.
It yields O(n * log(n)) complexity, which can be more or less easily shown via a recurrence relation.
I've tried to reimplement merge sort in an iterative fashion:
def atomize(l):
    return list(
        map(
            lambda x: [x],
            l if l is not None else []
        )
    )

def merge(l, r):
    res = []
    while (len(l) + len(r)) > 0:
        if len(l) < 1:
            res += r
            r = []
        elif len(r) < 1:
            res += l
            l = []
        else:
            if l[0] <= r[0]:
                res.append(l.pop(0))
            else:
                res.append(r.pop(0))
    return res

def iter_merge_sort(l):
    atoms = atomize(l)  # O(n)
    while len(atoms) > 1:  # O(n - 1)
        atoms.append(merge(atoms.pop(0), atoms.pop(0)))
    return atoms[0]
...and it feels like I am mistaken somewhere, yet I fail to notice the exact place. Recursive merge sort splits the problem until the list of unsorted values reduces to a list of singletons, single elements that can be compared. That's what atomize(...) does: given a list, it produces a list of single-element lists (order O(n)).
Obviously, merge(...) is O(n) as well: ignore for the moment that no linked lists are used for concatenation, that's not important here.
Finally, the while block in iter_merge_sort(...) itself takes exactly n - 1 repetitions, each of which costs at most O(n). Hence, I took an O(n * log(n)) algorithm and "improved" it to (n - 1) * n ~ O(n * n). Where is my mistake?
Your algorithm is entirely correct. The problem lies in the fact that you're using list.pop(0) as a way to dequeue, which costs O(n) in Python, since all items after the popped one have to be shifted to the preceding positions. (Note that the n - 1 merges do not each cost O(n): a merge costs time proportional to the combined length of its two runs, and each element takes part in only O(log n) merges, so with O(1) dequeues the total stays O(n * log(n)).)
You can use collections.deque in place of list so that you can use the deque.popleft method, which costs O(1):
from collections import deque

def atomize(l):
    return deque(
        map(
            lambda x: deque([x]),
            l if l is not None else []
        )
    )

def merge(l, r):
    res = deque()
    while (len(l) + len(r)) > 0:
        if len(l) < 1:
            res += r
            r = deque()
        elif len(r) < 1:
            res += l
            l = deque()
        else:
            if l[0] <= r[0]:
                res.append(l.popleft())
            else:
                res.append(r.popleft())
    return res

def iter_merge_sort(l):
    atoms = atomize(l)  # O(n)
    while len(atoms) > 1:  # O(n - 1)
        atoms.append(merge(atoms.popleft(), atoms.popleft()))
    return list(atoms[0])
so that:
iter_merge_sort([3,5,1,6,2,1])
returns:
[1, 1, 2, 3, 5, 6]

finding minimal difference

I have an array A = [a1, a2, a3, a4, a5, ...] and I want to find two elements of the array, say A[i] and A[j], such that i is less than j and A[j] - A[i] is minimal.
Would this code do the job?
def findMinDifference(A):
    Unsorted = []
    minDiff = 1000000
    Unsorted = A
    Sorted = quickSort(A)  # assumes a quickSort function is defined elsewhere
    for i in range(0, len(Sorted)):
        if i >= 1:
            SmallElement = Sorted[i-1]
            indexOfSmaller = Unsorted.index(SmallElement)
            BigElement = Sorted[i]
            indexOfBig = Unsorted.index(BigElement)
            if indexOfSmaller < indexOfBig:
                diff = Sorted[i] - Sorted[i-1]
                if diff < minDiff:
                    minDiff = diff
    return minDiff
Your code can be updated a bit:
a = [1, 2, 5, 9, 10, 20, 21, 45]
a, size = sorted(a), len(a)
res = [a[i + 1] - a[i] for i in range(size) if i + 1 < size]
print("MinDiff: {0}, MaxDiff: {1}.".format(min(res), max(res)))
In short, finding the min or max diff can be reduced to finding the min/max element of a list that consists of the differences between adjacent elements in the sorted original list of values.
Using the itertools pairwise recipe:
>>> from itertools import tee
>>> def pairwise(iterable):
...     "s -> (s0,s1), (s1,s2), (s2, s3), ..."
...     a, b = tee(iterable)
...     next(b, None)
...     return zip(a, b)
...
>>> nums = [1, 3, 7, 13, 9, 18, 22]
>>> min(pairwise(sorted(nums)), key=lambda x: x[1] - x[0])
(1, 3)
Not sure why the sort. You can adapt this pseudocode:
for i = 0; i < array.length; i++
    for j = i + 1; j < array.length; j++
        if a[j] - a[i] < min
            min = a[j] - a[i]
return min
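A direct Python rendering of that pseudocode (a sketch; it respects the i < j order constraint and runs in O(n^2)):
def min_diff_bruteforce(a):
    best = float('inf')  # stands in for the uninitialized min above
    for i in range(len(a)):
        for j in range(i + 1, len(a)):
            if a[j] - a[i] < best:
                best = a[j] - a[i]
    return best

print(min_diff_bruteforce([1, 3, 7, 13, 9, 18, 22]))  # -> -4 (from 9 - 13)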
This is another approach, using more iterables and relying more on defaults:
from itertools import islice

def find_min_diff(iterable, sort_func=sorted):
    sorted_iterable = sort_func(iterable)
    # pair each element with its successor; islice(..., 1, None) skips the first one
    return min(
        b - a
        for a, b in zip(sorted_iterable, islice(sorted_iterable, 1, None))
    )

hash functions family generator in python

I am looking for a hash functions family generator that could generate a family of hash functions given a set of parameters. I haven't found any such generator so far.
Is there a way to do that with the hashlib package?
For example I'd like to do something like:
h1 = hash_function(1)
h2 = hash_function(2)
...
and h1 and h2 would be different hash functions.
For those of you who might know about it, I am trying to implement a min-hashing algorithm on a very large dataset.
Basically, I have a very large set of features (100 million to 1 billion) for a given document, and I need to create 1000 to 10000 different random permutations of this set of features.
I do NOT want to build the random permutations explicitly, so the technique I would like to use is the following:
generate a hash function h and consider that, for two indices r and s,
r appears before s in the permutation if h(r) < h(s), and do that for 100 to 1000 different hash functions.
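For example, sorting indices by a hash h yields the permutation it implies, without ever materializing it (a minimal sketch with a made-up affine hash):
import random

def make_h(seed, p=2**61 - 1):
    # one random affine hash modulo a large prime
    rnd = random.Random(seed)
    a, b = rnd.randrange(1, p), rnd.randrange(p)
    return lambda x: (a * x + b) % p

h = make_h(42)
print(sorted(range(10), key=h))  # the order in which indices 0..9 would appear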
Are there any known libraries that I might have missed? Or any standard way of generating families of hash functions with Python that you might be aware of?
I'd just do something like (if you don't need thread-safety -- not hard to alter if you DO need thread safety -- and assuming a 32-bit Python version):
import random

_memomask = {}

def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash
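Usage would look like this (actual values depend on your Python's hash and on the masks):
h1 = hash_function(1)
h2 = hash_function(2)
print(h1("spam") == h1("spam"))  # True: each generated function is deterministic
print(h1("spam") == h2("spam"))  # almost certainly False: different masks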
As mentioned above, you can use universal hashing for minhash.
For example:
import random

def minhash():
    d1 = set(random.randint(0, 2000) for _ in range(1000))
    d2 = set(random.randint(0, 2000) for _ in range(1000))
    jacc_sim = len(d1.intersection(d2)) / len(d1.union(d2))
    print("jaccard similarity: {}".format(jacc_sim))
    N_HASHES = 200
    hash_funcs = []
    for i in range(N_HASHES):
        hash_funcs.append(universal_hashing())
    m1 = [min([h(e) for e in d1]) for h in hash_funcs]
    m2 = [min([h(e) for e in d2]) for h in hash_funcs]
    minhash_sim = sum(int(m1[i] == m2[i]) for i in range(N_HASHES)) / N_HASHES
    print("min-hash similarity: {}".format(minhash_sim))

def universal_hashing():
    def rand_prime():
        while True:
            # start from an odd number; trial division below only checks odd divisors
            p = random.randrange(2 ** 32 + 1, 2 ** 34, 2)
            if all(p % n != 0 for n in range(3, int((p ** 0.5) + 1), 2)):
                return p
    m = 2 ** 32 - 1
    p = rand_prime()
    a = random.randint(0, p)
    if a % 2 == 0:
        a += 1
    b = random.randint(0, p)
    def h(x):
        return ((a * x + b) % p) % m
    return h

minhash()
@alex's answer is great and concise, but the hash functions it generates are not "very different from each other".
Let's look at the Pearson correlation between 10000 samples of 10000 hashes that put the results in 100 bins:
%%time # 1min 14s
n=10000
hashes = [hash_function(i) for i in range(n)]
median_pvalue(hashes, n=n)
# 1.1614081043690444e-06
I.e. the median p-value is 1e-06, which is far from random. Here's an example if it were truly random:
%%time # 4min 15s
hashes = [lambda _ : random.randint(0,100) for _ in range(n)]
median_pvalue(hashes, n=n)
# 0.4979718236429698
Using the Carter and Wegman method you could get:
%%time # 1min 43s
hashes = HashFamily(100).draw_hashes(n)
median_pvalue(hashes, n=n)
# 0.841929288037321
Code to reproduce:
from scipy.stats import pearsonr
import numpy as np
import random

_memomask = {}

def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash

class HashFamily():
    r"""Universal hash family as proposed by Carter and Wegman.

    .. math::

        h_{a,b}(x) = ((ax + b) \bmod p) \bmod m \mid p > m

    Args:
        bins (int): Number of bins to hash to. Better if a prime number.
        moduler (int, optional): Temporary hashing modulus. Has to be a prime number.
    """
    def __init__(self, bins, moduler=None):
        if moduler and moduler <= bins:
            raise ValueError("p (moduler) should be >> m (buckets)")
        self.bins = bins
        self.moduler = moduler if moduler else self._next_prime(np.random.randint(self.bins + 1, 2**32))
        # do not allow same a and b, as it could mean shifted hashes
        self.sampled_a = set()
        self.sampled_b = set()

    def _is_prime(self, x):
        """Naive is-prime test."""
        for i in range(2, int(np.sqrt(x)) + 1):
            if x % i == 0:
                return False
        return True

    def _next_prime(self, n):
        """Naively gets the next prime larger than n."""
        while not self._is_prime(n):
            n += 1
        return n

    def draw_hash(self, a=None, b=None):
        """Draws a single hash function from the family."""
        if a is None:
            while a is None or a in self.sampled_a:
                a = np.random.randint(1, self.moduler - 1)
                assert len(self.sampled_a) < self.moduler - 2, "please give a bigger moduler"
            self.sampled_a.add(a)
        if b is None:
            while b is None or b in self.sampled_b:
                b = np.random.randint(0, self.moduler - 1)
                assert len(self.sampled_b) < self.moduler - 1, "please give a bigger moduler"
            self.sampled_b.add(b)
        return lambda x: ((a * x + b) % self.moduler) % self.bins

    def draw_hashes(self, n, **kwargs):
        """Draws n hash functions from the family."""
        return [self.draw_hash() for i in range(n)]

def median_pvalue(hashes, buckets=100, n=1000):
    p_values = []
    for j in range(n - 1):
        a = [hashes[j](i) % buckets for i in range(n)]
        b = [hashes[j + 1](i) % buckets for i in range(n)]
        p_values.append(pearsonr(a, b)[1])
    return np.median(p_values)
Note that my implementation of Carter and Wegman is very naive (e.g. the generation of prime numbers). It could be made shorter and quicker.
You should consider using universal hashing. My answer and code can be found here: https://stackoverflow.com/a/25104050/207661
A universal hash family is a set of hash functions H such that any two distinct inputs collide with probability at most 1/m (m being the number of output values) when the hash function h is drawn randomly from the set H.
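The classic Carter-Wegman construction (which the code below follows, up to boundary choices for a and b) is
h_{a,b}(x) = ((a*x + b) mod p) mod m, with p a prime, p >= m, 1 <= a < p, 0 <= b < p.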
Based on the formulation in Wikipedia, you can use the following code:
import random

def is_prime(n):
    if n == 2 or n == 3: return True
    if n % 2 == 0 or n < 2: return False
    for i in range(3, int(n**0.5) + 1, 2):
        if n % i == 0:
            return False
    return True

# universal hash functions
class UniversalHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset
        primes = []
        number_to_check = min_value_for_prime_number
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)
        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p))

    def __call__(self, function_index, input_integer):
        a, b, p = self.hash_function_attrs[function_index]
        return (((a*input_integer + b) % p) % self.number_of_buckets) + self.bucket_value_offset
Example usage:
We can create a hash family consisting of 20 hash functions, each one mapping the input to 100 buckets.
hash_family = UniversalHashFamily(20, 100)
And get the hashed values like:
input_integer = 1234567890 # sample input
hash_family(0, input_integer) # the output of the first hash function, i.e. h0(input_integer)
hash_family(1, input_integer) # the output of the second hash function, i.e. h1(input_integer)
# ...
hash_family(19, input_integer) # the output of the last hash function, i.e. h19(input_integer)
If you are interested in a universal hash family for string inputs, you can use the following code. But please note that this code may not be an optimized solution for string hashing.
class UniversalStringHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset
        primes = []
        number_to_check = max(min_value_for_prime_number, number_of_buckets)
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)
        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            a2 = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p, a2))

    def hash_int(self, int_to_hash, a, b, p):
        return (((a*int_to_hash + b) % p) % self.number_of_buckets) + self.bucket_value_offset

    def hash_str(self, str_to_hash, a, b, p, a2):
        str_to_hash = "1" + str_to_hash  # this will ensure that universality is not affected, see wikipedia for more detail
        l = len(str_to_hash) - 1
        int_to_hash = 0
        for i in range(l + 1):
            int_to_hash += ord(str_to_hash[i]) * (a2 ** (l - i))
        int_to_hash = int_to_hash % p
        return self.hash_int(int_to_hash, a, b, p)

    def __call__(self, function_index, str_to_hash):
        a, b, p, a2 = self.hash_function_attrs[function_index]
        return self.hash_str(str_to_hash, a, b, p, a2)
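Usage mirrors the integer family, e.g.:
string_hash_family = UniversalStringHashFamily(20, 100)
print(string_hash_family(0, "hello"))   # output of h0("hello"), in range(100)
print(string_hash_family(19, "hello"))  # output of h19("hello")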
