Binary polynomials modulo - Python - python

I have binary polynomials, which I represent like binary number. For example
a = 0b10011
b = 0b101
a is x^4+x+1 and b is x^2+1. So I want that
a%b = 2 # 10 as polynomial x
I would like to ask, how can I do it? I think that standard operation % of two polynomials will not work.

Here's a simple idea, given a normal polynomial division routine you could create a custom class to represent binary polynomials and then just override the (%) operator, maybe something like this:
from math import fabs
def poly_div(p1, p2):
def degree(poly):
while poly and poly[-1] == 0:
poly.pop()
return len(poly)-1
p2_degree = degree(p2)
p1_degree = degree(p1)
if p2_degree < 0:
raise ZeroDivisionError
if p1_degree >= p2_degree:
q = [0] * p1_degree
while p1_degree >= p2_degree:
d = [0]*(p1_degree - p2_degree) + p2
mult = q[p1_degree - p2_degree] = p1[-1] / float(d[-1])
d = [coeff*mult for coeff in d]
p1 = [fabs(p1_c - p2_c) for p1_c, p2_c in zip(p1, d)]
p1_degree = degree(p1)
r = p1
else:
q = [0]
r = p1
return q, r
class BinPoly:
def __init__(self, poly):
self.poly = [int(bit) for bit in list(poly)]
def __mod__(self, other):
return poly_div(self.poly, other.poly)
if __name__ == '__main__':
a = BinPoly('10011')
b = BinPoly('101')
print(a%b)
As you can see, you're constructing the polynomials out of string, tweaking the class to use binary numbers instead shouldn't be too hard, left as an exercise to the reader ;)

Related

Why is my RSA key function returning nan?

I am currently working on a project replicating RSA key generation and testing using euclidean algorithm, extended euclidean algorithm to find the modular inverse of the value.
I used the Miller-Rabin test to choose two prime numbers, p and q.
After running the code, I am able to obtain Kpub and e, however Kpr returns as nan.
Please help!
#Euclidean Algorithm func
def EucAlgo(a, b):
if a==0:
return b
return EucAlgo(b % a,a)
def ExEucAlgo(a,b):
if a==0:
return b,0,1
gcd, s1, t1 = ExEucAlgo(b%a,a)
#gcd of a,b
s = t1 - (b/a) * s1
t = s1
return gcd, s, t
def ExEucAlgo_modInverse(a,b):
gcd, s, t = ExEucAlgo(b,a)
if (gcd == 1):
i = t % a
elif (gcd !=1):
print("There is no inverse modulo for the input.")
return i
def SqMul_ModularExpo(b, exp, n):
bin_exp = bin(exp)
base = b
for i in range (3, len(bin_exp)):
base = (base ** 2) % n
if(bin_exp[i]=='1'):
i+=1
base = (base * b) %n
return base
#RSA Key generation
p=9054583561027584891319616491815785011595937977633787663340258672121877196627062461308487615739189212918799813327175451021729047602129396754172486202100997
q=10115395220079214686776355235686624745626962891667413288473649946208213820942557513105240135405981494333016032659525466362014175268953946332375459648688023
n= p * q
phi_n= (p-1) * (q-1)
e= randint(1, phi_n - 1)
while((EucAlgo(e,phi_n)) !=1):
e = randint(1, (phi_n-1))
d = ExEucAlgo_modInverse(e,phi_n)
print(f"\nKpr={d}")
print(f"\nKpub=(n={n})\n \ne={e}")
The problem is that you are using float point division which will result in returning float a point which when dealing with large int can result in very large floats which python can't handle so the solution is to use integer division which means 5//2=2 not 2.5. The problem is that Now encrypting and decrypting data would result in wrong decryption. (You wont get 2 again) because of some bugs in your functions.
FIRST: use public exponent pf 65537(prime number) which is the default for all RSA implementations(see your browser certificates) rather than finding a random one. Then after calculating the extended Euclidean algorithm which is used to find modulo inverse you dont have to make any more calculations(just return this value if GCD is 1 otherwise raise an error or whatever).
Here is the complete code that works after removing some unneeded (functions, imports, and random public exponent) READ comments.
def EucAlgo(a, b):
if a == 0:
return b
return EucAlgo(b % a, a)
def ExEucAlgo(a,b):
if a==0:
return b, 0, 1
gcd, s1, t1 = ExEucAlgo(b%a, a)
# You dont use / use // to make integer division
s = t1 - (b//a) * s1
t = s1
return gcd, s, t
def ExEucAlgo_modInverse(a,b):
gcd, s, t = ExEucAlgo(a, b)
if (gcd == 1):
# Just return s which is the inverse of public exponent
return s
elif (gcd != 1):
# I think it's better to raise an error but it's up to you
print("There is no inverse modulo for the input.")
#RSA Key generation
p = 9054583561027584891319616491815785011595937977633787663340258672121877196627062461308487615739189212918799813327175451021729047602129396754172486202100997
q = 10115395220079214686776355235686624745626962891667413288473649946208213820942557513105240135405981494333016032659525466362014175268953946332375459648688023
n = p * q
phi_n = (p-1) * (q-1)
# Just use fixed prime public exponent rather than trying fixed ones
e = 65537
d = ExEucAlgo_modInverse(e, phi_n)
print(f"\nKpr={d}")
print(f"\nKpub=(n={n})\n \ne={e}")
# Try to encrypt and decrypt 36
ciphertext = pow(36, e, n)
print("Encrypted data {}".format(ciphertext))
print("Decrypted data is {}".format(pow(ciphertext, d, n)))

python fuction finding root( or zero) with minimum distance from real root epsilon

So its and exercises for python i am totally stuck! you have a random Function in [a,b] you already know that the a is negative and b is positive and it has only ONE root. The true root is : -0.94564927392359 and you have to make a
def that will find the root( or zero ) that will be closest to the true root with minimum difference eps.The eps is 1e-8 or 1e-6.Note that we don't know the true root, before was an example to understand what the number we are looking for is about. Also we are given the above :
import math
def fnc(x):
""" This the function in which root we are looking for """
global a, b, eps
if not hasattr(fnc, "counter"):
fnc.counter = 0
fnc.maxtimes = (int)(0.1+math.ceil(math.log((b-a)/eps, 2.0)+2))
if fnc.counter<fnc.maxtimes:
fnc.counter += 1
return x*x*x-x-0.1
else:
return 0.0 ##
WE have to start with this :
def root(f, a, b, eps):
(sorry for my English )
Just simple bisect:
from __future__ import division
import math
def func(x):
return x*x*x-x-0.1
def sign(n):
try:
return n/abs(n)
except ZeroDivisionError:
return 0
def root(f, a, b, eps=1e-6):
f_a = f(a)
if abs(f_a) < eps:
return a
f_b = f(b)
if abs(f_b) < eps:
return b
half = (b+a)/2
f_half = f(half)
if sign(f_half) != sign(f_a):
return root(f, a, half, eps)
else:
return root(f, half, b, eps)
print root(func, -1.5, -0.5, 1e-8) # -0.945649273694
See if the following heuristic of iteratively slicing intervals into 2 equals and then choosing the admissible half is suitable for you.
def root(fnc, a, b, eps = 1e-8, maxtimes = None):
if maxtimes == None: maxtimes = (int)(0.1+math.ceil(math.log((b-a)/eps, 2.0)+2))
for counter in xrange(maxtimes+1) : # a was assumed negative and b positive
if fnc(a) > -eps : return a, -fnc(a)
if fnc(b) < eps : return b, fnc(b)
new_bound = (a + b)/2.0
print a, b, new_bound
if fnc(new_bound) < 0 : a = new_bound
else : b = new_bound
return new_bound, min(-fnc(a),fnc(b))
and then
fnc = lambda x : x**3-x-0.1
result = root(fnc, 0, 2, 1e-6)
print "root = ", result[0], "error = ", result[1]

Fastest possible method for the arcsin function on small, arbitrary floating-point values

I need to calculate the arcsine function of small values that are under the form of mpmath's "mpf" floating-point bignums.
What I call a "small" value is for example e/4/(10**7) = 0.000000067957045711476130884...
Here is a result of a test on my machine with mpmath's built-in asin function:
import gmpy2
from mpmath import *
from time import time
mp.dps = 10**6
val=e/4/(10**7)
print "ready"
start=time()
temp=asin(val)
print "mpmath asin: "+str(time()-start)+" seconds"
>>> 155.108999968 seconds
This is a particular case: I work with somewhat small numbers, so I'm asking myself if there is a way to calculate it in python that actually beats mpmath for this particular case (= for small values).
Taylor series are actually a good choice here because they converge very fast for small arguments. But I still need to accelerate the calculations further somehow.
Actually there are some problems:
1) Binary splitting is ineffective here because it shines only when you can write the argument as a small fraction. A full-precision float is given here.
2) arcsin is a non-alternating series, thus Van Wijngaarden or sumalt transformations are ineffective too (unless there is a way I'm not aware of to generalize them to non-alternating series).
https://en.wikipedia.org/wiki/Van_Wijngaarden_transformation
The only acceleration left I can think of is Chebyshev polynomials. Can Chebyshev polynomials be applied on the arcsin function? How to?
Can you use the mpfr type that is included in gmpy2?
>>> import gmpy2
>>> gmpy2.get_context().precision = 3100000
>>> val = gmpy2.exp(1)/4/10**7
>>> from time import time
>>> start=time();r=gmpy2.asin(val);print time()-start
3.36188197136
In addition to supporting the GMP library, gmpy2 also supports the MPFR and MPC multiple-precision libraries.
Disclaimer: I maintain gmpy2.
Actually binary splitting does work very well, if combined with iterated argument reduction to balance the number of terms against the size of the numerators and denominators (this is known as the bit-burst algorithm).
Here is a binary splitting implementation for mpmath based on repeated application of the formula atan(t) = atan(p/2^q) + atan((t*2^q-p) / (2^q+p*t)). This formula was suggested recently by Richard Brent (in fact mpmath's atan already uses a single invocation of this formula at low precision, in order to look up atan(p/2^q) from a cache). If I remember correctly, MPFR also uses the bit-burst algorithm to evaluate atan, but it uses a slightly different formula, which possibly is more efficient (instead of evaluating several different arctangent values, it does analytic continuation using the arctangent differential equation).
from mpmath.libmp import MPZ, bitcount
from mpmath import mp
def bsplit(p, q, a, b):
if b - a == 1:
if a == 0:
P = p
Q = q
else:
P = p * p
Q = q * 2
B = MPZ(1 + 2 * a)
if a % 2 == 1:
B = -B
T = P
return P, Q, B, T
else:
m = a + (b - a) // 2
P1, Q1, B1, T1 = bsplit(p, q, a, m)
P2, Q2, B2, T2 = bsplit(p, q, m, b)
T = ((T1 * B2) << Q2) + T2 * B1 * P1
P = P1 * P2
B = B1 * B2
Q = Q1 + Q2
return P, Q, B, T
def atan_bsplit(p, q, prec):
"""computes atan(p/2^q) as a fixed-point number"""
if p == 0:
return MPZ(0)
# FIXME
nterms = (-prec / (bitcount(p) - q) - 1) * 0.5
nterms = int(nterms) + 1
if nterms < 1:
return MPZ(0)
P, Q, B, T = bsplit(p, q, 0, nterms)
if prec >= Q:
return (T << (prec - Q)) // B
else:
return T // (B << (Q - prec))
def atan_fixed(x, prec):
t = MPZ(x)
s = MPZ(0)
q = 1
while t:
q = min(q, prec)
p = t >> (prec - q)
if p:
s += atan_bsplit(p, q, prec)
u = (t << q) - (p << prec)
v = (MPZ(1) << (q + prec)) + p * t
t = (u << prec) // v
q *= 2
return s
def atan1(x):
prec = mp.prec
man = x.to_fixed(prec)
return mp.mpf((atan_fixed(man, prec), -prec))
def asin1(x):
x = mpf(x)
return atan1(x/sqrt(1-x**2))
With this code, I get:
>>> from mpmath import *
>>> mp.dps = 1000000
>>> val=e/4/(10**7)
>>> from time import time
>>> start = time(); y1 = asin(x); print time() - start
58.8485069275
>>> start = time(); y2 = asin1(x); print time() - start
8.26498985291
>>> nprint(y2 - y1)
-2.31674e-1000000
Warning: atan1 assumes 0 <= x < 1/2, and the determination of the number of terms might not be optimal or correct (fixing these issues is left as an exercise to the reader).
A fast way is to use a pre-calculated look-up table.
But if you look at e.g. a Taylor series for asin;
def asin(x):
rv = (x + 1/3.0*x**3 + 7/30.0*x**5 + 64/315.0*x**7 + 4477/22680.0*x**9 +
28447/138600.0*x**11 + 23029/102960.0*x**13 +
17905882/70945875.0*x**15 + 1158176431/3958416000.0*x**17 +
9149187845813/26398676304000.0*x**19)
return rv
You'll see that for small values of x, asin(x) ≈ x.
In [19]: asin(1e-7)
Out[19]: 1.0000000000000033e-07
In [20]: asin(1e-9)
Out[20]: 1e-09
In [21]: asin(1e-11)
Out[21]: 1e-11
In [22]: asin(1e-12)
Out[22]: 1e-12
E.g. for the value us used:
In [23]: asin(0.000000067957045711476130884)
Out[23]: 6.795704571147624e-08
In [24]: asin(0.000000067957045711476130884)/0.000000067957045711476130884
Out[24]: 1.0000000000000016
Of course it depends on whether this difference is relevant to you.

Modular multiplicative inverse function in Python

Does some standard Python module contain a function to compute modular multiplicative inverse of a number, i.e. a number y = invmod(x, p) such that x*y == 1 (mod p)? Google doesn't seem to give any good hints on this.
Of course, one can come up with home-brewed 10-liner of extended Euclidean algorithm, but why reinvent the wheel.
For example, Java's BigInteger has modInverse method. Doesn't Python have something similar?
Python 3.8+
y = pow(x, -1, p)
Python 3.7 and earlier
Maybe someone will find this useful (from wikibooks):
def egcd(a, b):
if a == 0:
return (b, 0, 1)
else:
g, y, x = egcd(b % a, a)
return (g, x - (b // a) * y, y)
def modinv(a, m):
g, x, y = egcd(a, m)
if g != 1:
raise Exception('modular inverse does not exist')
else:
return x % m
If your modulus is prime (you call it p) then you may simply compute:
y = x**(p-2) mod p # Pseudocode
Or in Python proper:
y = pow(x, p-2, p)
Here is someone who has implemented some number theory capabilities in Python: http://www.math.umbc.edu/~campbell/Computers/Python/numbthy.html
Here is an example done at the prompt:
m = 1000000007
x = 1234567
y = pow(x,m-2,m)
y
989145189L
x*y
1221166008548163L
x*y % m
1L
You might also want to look at the gmpy module. It is an interface between Python and the GMP multiple-precision library. gmpy provides an invert function that does exactly what you need:
>>> import gmpy
>>> gmpy.invert(1234567, 1000000007)
mpz(989145189)
Updated answer
As noted by #hyh , the gmpy.invert() returns 0 if the inverse does not exist. That matches the behavior of GMP's mpz_invert() function. gmpy.divm(a, b, m) provides a general solution to a=bx (mod m).
>>> gmpy.divm(1, 1234567, 1000000007)
mpz(989145189)
>>> gmpy.divm(1, 0, 5)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ZeroDivisionError: not invertible
>>> gmpy.divm(1, 4, 8)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ZeroDivisionError: not invertible
>>> gmpy.divm(1, 4, 9)
mpz(7)
divm() will return a solution when gcd(b,m) == 1 and raises an exception when the multiplicative inverse does not exist.
Disclaimer: I'm the current maintainer of the gmpy library.
Updated answer 2
gmpy2 now properly raises an exception when the inverse does not exists:
>>> import gmpy2
>>> gmpy2.invert(0,5)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ZeroDivisionError: invert() no inverse exists
As of 3.8 pythons pow() function can take a modulus and a negative integer. See here. Their case for how to use it is
>>> pow(38, -1, 97)
23
>>> 23 * 38 % 97 == 1
True
Here is a one-liner for CodeFights; it is one of the shortest solutions:
MMI = lambda A, n,s=1,t=0,N=0: (n < 2 and t%N or MMI(n, A%n, t, s-A//n*t, N or n),-1)[n<1]
It will return -1 if A has no multiplicative inverse in n.
Usage:
MMI(23, 99) # returns 56
MMI(18, 24) # return -1
The solution uses the Extended Euclidean Algorithm.
Sympy, a python module for symbolic mathematics, has a built-in modular inverse function if you don't want to implement your own (or if you're using Sympy already):
from sympy import mod_inverse
mod_inverse(11, 35) # returns 16
mod_inverse(15, 35) # raises ValueError: 'inverse of 15 (mod 35) does not exist'
This doesn't seem to be documented on the Sympy website, but here's the docstring: Sympy mod_inverse docstring on Github
Here is a concise 1-liner that does it, without using any external libraries.
# Given 0<a<b, returns the unique c such that 0<c<b and a*c == gcd(a,b) (mod b).
# In particular, if a,b are relatively prime, returns the inverse of a modulo b.
def invmod(a,b): return 0 if a==0 else 1 if b%a==0 else b - invmod(b%a,a)*b//a
Note that this is really just egcd, streamlined to return only the single coefficient of interest.
I try different solutions from this thread and in the end I use this one:
def egcd(a, b):
lastremainder, remainder = abs(a), abs(b)
x, lastx, y, lasty = 0, 1, 1, 0
while remainder:
lastremainder, (quotient, remainder) = remainder, divmod(lastremainder, remainder)
x, lastx = lastx - quotient*x, x
y, lasty = lasty - quotient*y, y
return lastremainder, lastx * (-1 if a < 0 else 1), lasty * (-1 if b < 0 else 1)
def modinv(a, m):
g, x, y = self.egcd(a, m)
if g != 1:
raise ValueError('modinv for {} does not exist'.format(a))
return x % m
Modular_inverse in Python
Here is my code, it might be sloppy but it seems to work for me anyway.
# a is the number you want the inverse for
# b is the modulus
def mod_inverse(a, b):
r = -1
B = b
A = a
eq_set = []
full_set = []
mod_set = []
#euclid's algorithm
while r!=1 and r!=0:
r = b%a
q = b//a
eq_set = [r, b, a, q*-1]
b = a
a = r
full_set.append(eq_set)
for i in range(0, 4):
mod_set.append(full_set[-1][i])
mod_set.insert(2, 1)
counter = 0
#extended euclid's algorithm
for i in range(1, len(full_set)):
if counter%2 == 0:
mod_set[2] = full_set[-1*(i+1)][3]*mod_set[4]+mod_set[2]
mod_set[3] = full_set[-1*(i+1)][1]
elif counter%2 != 0:
mod_set[4] = full_set[-1*(i+1)][3]*mod_set[2]+mod_set[4]
mod_set[1] = full_set[-1*(i+1)][1]
counter += 1
if mod_set[3] == B:
return mod_set[2]%B
return mod_set[4]%B
The code above will not run in python3 and is less efficient compared to the GCD variants. However, this code is very transparent. It triggered me to create a more compact version:
def imod(a, n):
c = 1
while (c % a > 0):
c += n
return c // a
from the cpython implementation source code:
def invmod(a, n):
b, c = 1, 0
while n:
q, r = divmod(a, n)
a, b, c, n = n, c, b - q*c, r
# at this point a is the gcd of the original inputs
if a == 1:
return b
raise ValueError("Not invertible")
according to the comment above this code, it can return small negative values, so you could potentially check if negative and add n when negative before returning b.
To figure out the modular multiplicative inverse I recommend using the Extended Euclidean Algorithm like this:
def multiplicative_inverse(a, b):
origA = a
X = 0
prevX = 1
Y = 1
prevY = 0
while b != 0:
temp = b
quotient = a/b
b = a%b
a = temp
temp = X
a = prevX - quotient * X
prevX = temp
temp = Y
Y = prevY - quotient * Y
prevY = temp
return origA + prevY
Well, here's a function in C which you can easily convert to python. In the below c function extended euclidian algorithm is used to calculate inverse mod.
int imod(int a,int n){
int c,i=1;
while(1){
c = n * i + 1;
if(c%a==0){
c = c/a;
break;
}
i++;
}
return c;}
Translates to Python Function
def imod(a,n):
i=1
while True:
c = n * i + 1;
if(c%a==0):
c = c/a
break;
i = i+1
return c
Reference to the above C function is taken from the following link C program to find Modular Multiplicative Inverse of two Relatively Prime Numbers

hash functions family generator in python

I am looking for a hash functions family generator that could generate a family of hash functions given a set of parameters. I haven't found any such generator so far.
Is there a way to do that with the hashlib package ?
For example I'd like to do something like :
h1 = hash_function(1)
h2 = hash_function(2)
...
and h1 and h2 would be different hash functions.
For those of you who might know about it, I am trying to implement a min-hashing algorithm on a very large dataset.
Basically, I have a very large set of features (100 millions to 1 billion) for a given document, and I need to create 1000 to 10000 different random permutations for this set of features.
I do NOT want to build the random permutations explicitly so the technique I would like to use in the following :
generate a hash function h and consider that for two indices r and s
r appears before s in the permutation if h(r) < h(s) and do that for 100 to 1000 different hash functions.
Are there any known libraries that I might have missed ? Or any standard way of generating families of hash functions with python that you might be aware of ?
I'd just do something like (if you don't need thread-safety -- not hard to alter if you DO need thread safety -- and assuming a 32-bit Python version):
import random
_memomask = {}
def hash_function(n):
mask = _memomask.get(n)
if mask is None:
random.seed(n)
mask = _memomask[n] = random.getrandbits(32)
def myhash(x):
return hash(x) ^ mask
return myhash
As mentioned above, you can use universal hashing for minhash.
For example:
import random
def minhash():
d1 = set(random.randint(0, 2000) for _ in range(1000))
d2 = set(random.randint(0, 2000) for _ in range(1000))
jacc_sim = len(d1.intersection(d2)) / len(d1.union(d2))
print("jaccard similarity: {}".format(jacc_sim))
N_HASHES = 200
hash_funcs = []
for i in range(N_HASHES):
hash_funcs.append(universal_hashing())
m1 = [min([h(e) for e in d1]) for h in hash_funcs]
m2 = [min([h(e) for e in d2]) for h in hash_funcs]
minhash_sim = sum(int(m1[i] == m2[i]) for i in range(N_HASHES)) / N_HASHES
print("min-hash similarity: {}".format(minhash_sim))
def universal_hashing():
def rand_prime():
while True:
p = random.randrange(2 ** 32, 2 ** 34, 2)
if all(p % n != 0 for n in range(3, int((p ** 0.5) + 1), 2)):
return p
m = 2 ** 32 - 1
p = rand_prime()
a = random.randint(0, p)
if a % 2 == 0:
a += 1
b = random.randint(0, p)
def h(x):
return ((a * x + b) % p) % m
return h
Reference
#alex's answer is great and concise, but the hash functions it generates are not "very different from each other".
Let's look at the Pearson correlation between 10000 samples of 10000 hashes that put the results in 100 bins
%%time # 1min 14s
n=10000
hashes = [hash_function(i) for i in range(n)]
median_pvalue(hashes, n=n)
# 1.1614081043690444e-06
I.e. the median p_value is 1e-06 which is far from random. Here's an example if it were truly random :
%%time # 4min 15s
hashes = [lambda _ : random.randint(0,100) for _ in range(n)]
median_pvalue(hashes, n=n)
# 0.4979718236429698
Using Carter and Wegman method you could get:
%%time # 1min 43s
hashes = HashFamily(100).draw_hashes(n)
median_pvalue(hashes, n=n)
# 0.841929288037321
Code to reproduce :
from scipy.stats.stats import pearsonr
import numpy as np
import random
_memomask = {}
def hash_function(n):
mask = _memomask.get(n)
if mask is None:
random.seed(n)
mask = _memomask[n] = random.getrandbits(32)
def myhash(x):
return hash(x) ^ mask
return myhash
class HashFamily():
r"""Universal hash family as proposed by Carter and Wegman.
.. math::
\begin{array}{ll}
h_{{a,b}}(x)=((ax+b)~{\bmod ~}p)~{\bmod ~}m \ \mid p > m\\
\end{array}
Args:
bins (int): Number of bins to hash to. Better if a prime number.
moduler (int,optional): Temporary hashing. Has to be a prime number.
"""
def __init__(self, bins, moduler=None):
if moduler and moduler <= bins:
raise ValueError("p (moduler) should be >> m (buckets)")
self.bins = bins
self.moduler = moduler if moduler else self._next_prime(np.random.randint(self.bins + 1, 2**32))
# do not allow same a and b, as it could mean shifted hashes
self.sampled_a = set()
self.sampled_b = set()
def _is_prime(self, x):
"""Naive is prime test."""
for i in range(2, int(np.sqrt(x))):
if x % i == 0:
return False
return True
def _next_prime(self, n):
"""Naively gets the next prime larger than n."""
while not self._is_prime(n):
n += 1
return n
def draw_hash(self, a=None, b=None):
"""Draws a single hash function from the family."""
if a is None:
while a is None or a in self.sampled_a:
a = np.random.randint(1, self.moduler - 1)
assert len(self.sampled_a) < self.moduler - 2, "please give a bigger moduler"
self.sampled_a.add(a)
if b is None:
while b is None or b in self.sampled_b:
b = np.random.randint(0, self.moduler - 1)
assert len(self.sampled_b) < self.moduler - 1, "please give a bigger moduler"
self.sampled_b.add(b)
return lambda x: ((a * x + b) % self.moduler) % self.bins
def draw_hashes(self, n, **kwargs):
"""Draws n hash function from the family."""
return [self.draw_hash() for i in range(n)]
def median_pvalue(hashes, buckets=100, n=1000):
p_values = []
for j in range(n-1):
a = [hashes[j](i) % buckets for i in range(n)]
b = [hashes[j+1](i) % buckets for i in range(n)]
p_values.append(pearsonr(a,b)[1])
return np.median(p_values)
Note that my implementation is of Carter and Wegman is very naive (e.g. generation of prime numbers). It could be made shorter and quicker.
You should consider using universal hashing. My answer and code can be found here: https://stackoverflow.com/a/25104050/207661
The universal hash family is a set of hash functions H of size m, such that any two (district) inputs collide with probability at most 1/m when the hash function h is drawn randomly from set H.
Based on the formulation in Wikipedia, use can use the following code:
import random
def is_prime(n):
if n==2 or n==3: return True
if n%2==0 or n<2: return False
for i in range(3, int(n**0.5)+1, 2):
if n%i==0:
return False
return True
# universal hash functions
class UniversalHashFamily:
def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
self.number_of_buckets = number_of_buckets
self.bucket_value_offset = bucket_value_offset
primes = []
number_to_check = min_value_for_prime_number
while len(primes) < number_of_hash_functions:
if is_prime(number_to_check):
primes.append(number_to_check)
number_to_check += random.randint(1, 1000)
self.hash_function_attrs = []
for i in range(number_of_hash_functions):
p = primes[i]
a = random.randint(1, p)
b = random.randint(0, p)
self.hash_function_attrs.append((a, b, p))
def __call__(self, function_index, input_integer):
a, b, p = self.hash_function_attrs[function_index]
return (((a*input_integer + b)%p)%self.number_of_buckets) + self.bucket_value_offset
Example usage:
We can create a hash family consists of 20 hash functions, each one map the input to 100 buckets.
hash_family = UniversalHashFamily(20, 100)
And get the hashed values like:
input_integer = 1234567890 # sample input
hash_family(0, input_integer) # the output of the first hash function, i.e. h0(input_integer)
hash_family(1, input_integer) # the output of the second hash function, i.e. h1(input_integer)
# ...
hash_family(19, input_integer) # the output of the last hash function, i.e. h19(input_integer)
If you are interested in the universal hash family for string inputs, you can use the following code. But please note that this code may not be the optimized solution for string hashing.
class UniversalStringHashFamily:
def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
self.number_of_buckets = number_of_buckets
self.bucket_value_offset = bucket_value_offset
primes = []
number_to_check = max(min_value_for_prime_number, number_of_buckets)
while len(primes) < number_of_hash_functions:
if is_prime(number_to_check):
primes.append(number_to_check)
number_to_check += random.randint(1, 1000)
self.hash_function_attrs = []
for i in range(number_of_hash_functions):
p = primes[i]
a = random.randint(1, p)
a2 = random.randint(1, p)
b = random.randint(0, p)
self.hash_function_attrs.append((a, b, p, a2))
def hash_int(self, int_to_hash, a, b, p):
return (((a*int_to_hash + b)%p)%self.number_of_buckets) + self.bucket_value_offset
def hash_str(self, str_to_hash, a, b, p, a2):
str_to_hash = "1" + str_to_hash # this will ensure that universality is not affected, see wikipedia for more detail
l = len(str_to_hash)-1
int_to_hash = 0
for i in range(l+1):
int_to_hash += ord(str_to_hash[i]) * (a2 ** (l-i))
int_to_hash = int_to_hash % p
return self.hash_int(int_to_hash, a, b, p)
def __call__(self, function_index, str_to_hash):
a, b, p, a2 = self.hash_function_attrs[function_index]
return self.hash_str(str_to_hash, a, b, p, a2)

Categories

Resources