from math import sqrt

S1 = [1,0,0,0,1,0,0,2]
S3 = [0,1,1,2,0,1,2,0]
sum = 0
sums1 = 0
sums3 = 0
for i, j in zip(S1,S3):
    sums1 += i*i
    sums3 += j*j
    sum += i*j
    cosine_similarity = sum / ((sqrt(sums1)) * (sqrt(sums3)))
    print (cosine_similarity)
Please, how can I remove this error from my code? I want to find the cosine similarity of two vectors.
The error is due to the indentation level of the last two lines (as mentioned in the comments by j1-lee):
# ...
    sum += i*j

# deindentation
cosine_similarity = sum / ((sqrt(sums1)) * (sqrt(sums3)))
print(cosine_similarity)
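Putting it together, the corrected program runs end to end (sum is renamed to total here purely to avoid shadowing the built-in sum; that rename is optional and not part of the fix):

from math import sqrt

S1 = [1, 0, 0, 0, 1, 0, 0, 2]
S3 = [0, 1, 1, 2, 0, 1, 2, 0]
total = 0
sums1 = 0
sums3 = 0
for i, j in zip(S1, S3):
    sums1 += i * i
    sums3 += j * j
    total += i * j

cosine_similarity = total / (sqrt(sums1) * sqrt(sums3))
print(cosine_similarity)  # 0.0 for these two vectors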
Here is another implementation, decomposing the definition of cosine similarity into smaller operations:
def scalar_product(a, b):
    return sum(a_i*b_i for a_i, b_i in zip(a, b))

def norm(a):
    return sum(a_i**2 for a_i in a)**.5

def cosine_similarity(a, b):
    return scalar_product(a, b) / (norm(a)*norm(b))
S1 = [1,0,0,0,1,0,0,2]
S3 = [0,1,1,2,0,1,2,0]
cs = cosine_similarity(S1, S3)
print(cs)
# 0.0 # orthogonality
cs = cosine_similarity(S1, S1)
print(cs)
# 1.0...# parallelity
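If NumPy is already a dependency, the same quantity can be computed in one line with a dot product and vector norms (just an alternative sketch, not required for either version above):

import numpy as np

S1 = np.array([1, 0, 0, 0, 1, 0, 0, 2])
S3 = np.array([0, 1, 1, 2, 0, 1, 2, 0])
print(S1 @ S3 / (np.linalg.norm(S1) * np.linalg.norm(S3)))  # 0.0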
Hello, I am working on a problem that seems to be out of my league, so any tips, pointers to reading materials, etc. are really appreciated. That being said, here is the problem:
Given 3 subsets of numbers a, b, c ⊆ {0, ..., n}, check in O(n log n) whether there exist numbers n1 in a, n2 in b, and n3 in c such that n1 + n2 = n3.
I am given the hint to convert a and b to polynomial coefficients and to use polynomial multiplication via FFT to multiply the coefficients of a and b.
Now where I am stuck is after getting the result of the polynomial multiplication: what do I do next?
Thank you in advance.
from numpy.fft import fft, ifft
from numpy import real, imag

def polynomial_multiply(a_coeff_list, b_coeff_list):
    # Return the coefficient list of the multiplication
    # of the two polynomials.
    # The returned list must be a list of floating point numbers;
    # convert the list from complex to reals by using the
    # real function in numpy.
    len_a = len(a_coeff_list)
    len_b = len(b_coeff_list)
    for i in range(len_a-1):
        b_coeff_list.append(0)
    for i in range(len_b-1):
        a_coeff_list.append(0)
    a_fft = fft(a_coeff_list)
    b_fft = fft(b_coeff_list)
    c = []
    for i in range(len(a_fft)):
        c.append(a_fft[i] * b_fft[i])
    inverse_c = ifft(c)
    return real(inverse_c)
# inputs: sets a, b, c
# return True if there exist n1 in a, n2 in b such that n1+n2 is in c,
# return False otherwise
# n signifies the maximum number in a, b, c
def check_sum_exists(a, b, c, n):
    a_coeffs = [0]*n
    b_coeffs = [0]*n
    # convert sets a, b into polynomials as provided in the hint
    # a_coeffs and b_coeffs should contain the result
    i = 0
    for item in a:
        a_coeffs[i] = item
        i += 1
    i = 0
    for item in b:
        b_coeffs[i] = item
        i += 1
    # multiply them together
    c_coeffs = polynomial_multiply(a_coeffs, b_coeffs)
    # now this is where I am lost:
    # what do I do with c_coeffs?
    return False
    # return True/False
Thanks to all who helped. I figured it out, and hopefully this can help anyone who runs into a similar problem. The issue I had was that I incorrectly assigned the coefficients for a_coeffs and b_coeffs.
Here is the solution which passed the tests, for those interested.
from numpy.fft import fft, ifft
from numpy import real, imag

def check_sum_exists(a, b, c, n):
    a_coeffs = [0] * n
    b_coeffs = [0] * n
    # convert sets a, b into polynomials as provided in the hint
    # a_coeffs and b_coeffs should contain the result
    for coeff in a:
        a_coeffs[coeff] = 1
    for coeff in b:
        b_coeffs[coeff] = 1
    # multiply them together
    c_coeffs = polynomial_multiply(a_coeffs, b_coeffs)
    # use the result to solve the problem at hand
    for coeff in c:
        if c_coeffs[coeff] >= .5:
            return True
    return False
    # return True/False

def polynomial_multiply(a_coeff_list, b_coeff_list):
    # Return the coefficient list of the multiplication
    # of the two polynomials.
    # The returned list must be a list of floating point numbers.
    # Please convert the list from complex to reals by using the
    # real function in numpy.
    # Capture the original lengths before padding; otherwise the two
    # lists end up with different lengths and the FFTs cannot be
    # multiplied element-wise.
    len_a = len(a_coeff_list)
    len_b = len(b_coeff_list)
    for i in range(len_a - 1):
        b_coeff_list.append(0)
    for i in range(len_b - 1):
        a_coeff_list.append(0)
    a_fft = fft(a_coeff_list)
    b_fft = fft(b_coeff_list)
    c = []
    for i in range(len(a_fft)):
        c.append(a_fft[i] * b_fft[i])
    return real(ifft(c))
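As a quick sanity check on toy sets (values chosen arbitrarily): entry k of the convolution counts the pairs (n1, n2) with n1 + n2 = k, so a value of at least 0.5 at some index in c signals a valid pair even with FFT round-off. Here n is simply chosen large enough to index every element:

print(check_sum_exists({1, 2}, {3}, {5}, 6))  # True: 2 + 3 = 5
print(check_sum_exists({1, 2}, {3}, {6}, 7))  # False: no pair sums to 6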
Given some f and the differential equation x'(t) = f(x(t)), how do I compute the n-th derivative x^(n)(t) in terms of x(t)?
For example, given f(x(t)) = sin(x(t)),
I want to obtain x^(3)(t) = (cos(x(t))^2 − sin(x(t))^2)·sin(x(t)).
So far I've tried
>>> from sympy import diff, sin
>>> from sympy.abc import x, t
>>> diff(sin(x(t)), t, 2)
which gives me
-sin(x(t))*Derivative(x(t), t)**2 + cos(x(t))*Derivative(x(t), t, t)
but I'm not sure how to tell SymPy what Derivative(x(t), t) is and have it figure out Derivative(x(t), t, t), etc. automatically.
Answer:
Here's my final solution based on the answers I received below:
import sympy

def diff(x_derivs_known, t, k, simplify=False):
    try: n = len(x_derivs_known)
    except TypeError: n = None
    if n is None:
        result = sympy.diff(x_derivs_known, t, k)
        if simplify: result = result.simplify()
    elif k < n:
        result = x_derivs_known[k]
    else:
        i = n - 1
        result = x_derivs_known[i]
        while i < k:
            result = result.diff(t)
            j = len(x_derivs_known)
            while j > 1:
                j -= 1
                result = result.subs(sympy.Derivative(x_derivs_known[0], t, j), x_derivs_known[j])
            i += 1
            if simplify: result = result.simplify()
    return result
Example:
>>> diff((x(t), sympy.sin(x(t))), t, 3, True)
sin(x(t))*cos(2*x(t))
Here is one approach that returns a list of all derivatives up to the n-th order:
import sympy as sp

x = sp.Function('x')
t = sp.symbols('t')
f = lambda x: x**2  # sp.exp, sp.sin
n = 4  # 3, 4, 5

deriv_list = [x(t), f(x(t))]  # list of derivatives [x(t), x'(t), x''(t), ...]
for i in range(1, n):
    df_i = deriv_list[-1].diff(t).replace(sp.Derivative, lambda *args: f(x(t)))
    deriv_list.append(df_i)

print(deriv_list)
[x(t), x(t)**2, 2*x(t)**3, 6*x(t)**4, 24*x(t)**5]
With f=sp.sin it returns
[x(t), sin(x(t)), sin(x(t))*cos(x(t)), -sin(x(t))**3 + sin(x(t))*cos(x(t))**2, -5*sin(x(t))**3*cos(x(t)) + sin(x(t))*cos(x(t))**3]
EDIT: A recursive function for the computation of the n-th derivative:
def der_xt(f, n):
    if n == 1:
        return f(x(t))
    else:
        return der_xt(f, n-1).diff(t).replace(sp.Derivative, lambda *args: f(x(t)))

print(der_xt(sp.sin, 3))
-sin(x(t))**3 + sin(x(t))*cos(x(t))**2
Declare f and use substitution:
>>> f = diff(x(t))
>>> diff(sin(x(t)), t, 2).subs(f, sin(x(t)))
-sin(x(t))**3 + cos(x(t))*Derivative(sin(x(t)), t)
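To carry that substitution all the way to the closed form the question asks for, the same differentiate-then-substitute step can be repeated in a loop; a minimal sketch (my own, not part of the answer above):

import sympy as sp
from sympy.abc import t

x = sp.Function('x')
xp = sp.Derivative(x(t), t)   # the x'(t) factor to be replaced each time

expr = sp.sin(x(t))           # x'(t) = sin(x(t))
for _ in range(2):            # two more derivatives give x'''(t)
    expr = expr.diff(t).subs(xp, sp.sin(x(t)))
print(sp.simplify(expr))      # mathematically equal to (cos(x(t))**2 - sin(x(t))**2)*sin(x(t))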
So it's an exercise for Python and I am totally stuck! You have a random function on [a, b]; you already know that the function is negative at a and positive at b, and that it has only ONE root. The true root is -0.94564927392359, and you have to make a def that will find the root (or zero) closest to the true root, within a minimum difference eps. The eps is 1e-8 or 1e-6. Note that we don't know the true root; the value above was just an example to show what kind of number we are looking for. Also, we are given the following:
import math

def fnc(x):
    """ This is the function whose root we are looking for. """
    global a, b, eps
    if not hasattr(fnc, "counter"):
        fnc.counter = 0
        fnc.maxtimes = (int)(0.1+math.ceil(math.log((b-a)/eps, 2.0)+2))
    if fnc.counter < fnc.maxtimes:
        fnc.counter += 1
        return x*x*x-x-0.1
    else:
        return 0.0  ##
We have to start with this:
def root(f, a, b, eps):
(Sorry for my English.)
Just simple bisect:
from __future__ import division
import math

def func(x):
    return x*x*x-x-0.1

def sign(n):
    try:
        return n/abs(n)
    except ZeroDivisionError:
        return 0

def root(f, a, b, eps=1e-6):
    f_a = f(a)
    if abs(f_a) < eps:
        return a
    f_b = f(b)
    if abs(f_b) < eps:
        return b
    half = (b+a)/2
    f_half = f(half)
    if sign(f_half) != sign(f_a):
        return root(f, a, half, eps)
    else:
        return root(f, half, b, eps)

print root(func, -1.5, -0.5, 1e-8)  # -0.945649273694
See if the following heuristic, which iteratively slices the interval into two equal halves and then chooses the admissible half, is suitable for you.
import math

def root(fnc, a, b, eps=1e-8, maxtimes=None):
    if maxtimes == None: maxtimes = (int)(0.1+math.ceil(math.log((b-a)/eps, 2.0)+2))
    for counter in xrange(maxtimes+1):  # a was assumed negative and b positive
        if fnc(a) > -eps: return a, -fnc(a)
        if fnc(b) < eps: return b, fnc(b)
        new_bound = (a + b)/2.0
        print a, b, new_bound
        if fnc(new_bound) < 0: a = new_bound
        else: b = new_bound
    return new_bound, min(-fnc(a), fnc(b))
and then
fnc = lambda x : x**3-x-0.1
result = root(fnc, 0, 2, 1e-6)
print "root = ", result[0], "error = ", result[1]
I want to solve an equation which I am supposed to solve recursively. I uploaded a picture of the formula. (Sorry! I did not know how to write mathematical formulas here!)
I wrote the code in Python as below:
import math

alambda = 1.0
rho = 0.8
c = 1.0
b = rho * c / alambda
P0 = (1 - (alambda*b))
P1 = (1-(alambda*b))*(math.exp(alambda*b) - 1)

def a(n):
    a_n = math.exp(-alambda*b) * ((alambda*b)**n) / math.factorial(n)
    return a_n

def P(n):
    P(n) = (P0+P1)*a(n) + sigma(n)

def sigma(n):
    j = 2
    result = 0
    while j <= n+1:
        result = result + P(j)*a(n+1-j)
        j += 1
    return result
It is obvious that I could not finish the P function, so please help me with this.
When n=1 I should extract P2; when n=2 I should extract P3.
By the way, P0 and P1 are as written in lines 6 and 7.
When I call P(5) I want to see P(0), P(1), P(2), P(3), P(4), P(5), P(6) in the output.
You need to reorganize the formula so that you don't have to calculate P(3) to calculate P(2). This is pretty easy to do, by bringing the last term of the summation, P(n+1)a(0), to the left side of the equation and dividing through by a(0). Then you have a formula for P(n+1) in terms of P(m) where m <= n, which is solvable by recursion.
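Concretely, reading the recurrence off the sigma() in the question (an assumption on my part, since the original formula is only available as a picture):

P(n) = (P0 + P1)·a(n) + Σ_{j=2}^{n+1} P(j)·a(n+1−j)

Pulling the j = n+1 term, P(n+1)·a(0), out of the sum and dividing by a(0) gives

P(n+1) = [ P(n) − (P0 + P1)·a(n) − Σ_{j=2}^{n} P(j)·a(n+1−j) ] / a(0)

which only involves P(m) with m ≤ n, so it can be computed recursively starting from P0 and P1.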
As Bruce mentions, it's best to cache your intermediate results for P(n) by keeping them in a dict, so that a) you don't have to recalculate P(2) etc. every time you need it, and b) after you get the value of P(n), you can just print the dict to see all the values of P(m) where m <= n.
import math

a_lambda = 1.0
rho = 0.8
c = 1.0
b = rho * c / a_lambda

p0 = (1 - (a_lambda*b))
p1 = (1-(a_lambda*b))*(math.exp(a_lambda*b) - 1)
p_dict = {0: p0, 1: p1}

def a(n):
    return math.exp(-a_lambda*b) * ((a_lambda*b)**n) / math.factorial(n)

def get_nth_p(n, p_dict):
    # return pre-calculated value if p(n) is already known
    if n in p_dict:
        return p_dict[n]
    # Calculate p(n) using modified formula
    p_n = ((get_nth_p(n-1, p_dict)
            - (get_nth_p(0, p_dict) + get_nth_p(1, p_dict)) * a(n - 1)
            - sum(get_nth_p(j, p_dict) * a(n + 1 - j) for j in xrange(2, n)))
           / a(0))
    # Save computed value into the dict
    p_dict[n] = p_n
    return p_n

get_nth_p(6, p_dict)
print p_dict
Edit 2
Some cosmetic updates to the code - shortening the name and making p_dict a mutable default argument (something I try to use only sparingly) really makes the code much more readable:
import math

# Customary to distinguish variables that are unchanging by making them ALLCAP
A_LAMBDA = 1.0
RHO = 0.8
C = 1.0
B = RHO * C / A_LAMBDA

P0 = (1 - (A_LAMBDA*B))
P1 = (1-(A_LAMBDA*B))*(math.exp(A_LAMBDA*B) - 1)
p_value_cache = {0: P0, 1: P1}

def a(n):
    return math.exp(-A_LAMBDA*B) * ((A_LAMBDA*B)**n) / math.factorial(n)

def p(n, p_dict=p_value_cache):
    # return pre-calculated value if p(n) is already known
    if n in p_dict:
        return p_dict[n]
    # Calculate p(n) using modified formula
    p_n = ((p(n-1)
            - (p(0) + p(1)) * a(n - 1)
            - sum(p(j) * a(n + 1 - j) for j in xrange(2, n)))
           / a(0))
    # Save computed value into the dict
    p_dict[n] = p_n
    return p_n

p(6)
print p_value_cache
You could fix it that way:
import math

alambda = 1.0
rho = 0.8
c = 1.0
b = rho * c / alambda

def a(n):
    # you might want to cache a as well
    a_n = math.exp(-alambda*b) * ((alambda*b)**n) / math.factorial(n)
    return a_n

PCache = {0: (1 - (alambda*b)), 1: (1-(alambda*b))*(math.exp(alambda*b) - 1)}

def P(n):
    if n in PCache:
        return PCache[n]
    ret = (P(0)+P(1))*a(n) + sigma(n)
    PCache[n] = ret
    return ret

def sigma(n):
    # caching this seems smart as well
    j = 2
    result = 0
    while j <= n+1:
        result = result + P(j)*a(n+1-j)
        j += 1
    return result

def displayP(n):
    P(n)  # fill cache :-)
    for x in range(n):
        print ("%u -> %d\n" % (x, PCache[x]))
Instead of managing the cache manually, you might want to use a memoize decorator (see http://www.python-course.eu/python3_memoization.php).
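For example, a minimal sketch of that idea applied to a(n), which the comment above already suggests caching (the same decorator could wrap P or sigma once the recurrence itself is sorted out):

from functools import lru_cache

@lru_cache(maxsize=None)
def a(n):
    # same computation as above; results are now memoized by the decorator
    return math.exp(-alambda*b) * ((alambda*b)**n) / math.factorial(n)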
Notes:
not tested, but you should get the idea behind it
your recurrence won't work: P(n) depends on P(n+1) in your equation... this will never end
It looks like I misunderstood P0 and P1 as being both constants (big numbers) and results (small numbers, indices)... the notation is not the best choice, I guess...
I am looking for a hash-function family generator that could generate a family of hash functions given a set of parameters. I haven't found any such generator so far.
Is there a way to do that with the hashlib package?
For example, I'd like to do something like:
h1 = hash_function(1)
h2 = hash_function(2)
...
and h1 and h2 would be different hash functions.
For those of you who might know about it, I am trying to implement a min-hashing algorithm on a very large dataset.
Basically, I have a very large set of features (100 million to 1 billion) for a given document, and I need to create 1000 to 10000 different random permutations for this set of features.
I do NOT want to build the random permutations explicitly, so the technique I would like to use is the following:
generate a hash function h and consider that, for two indices r and s,
r appears before s in the permutation if h(r) < h(s), and do that for 100 to 1000 different hash functions.
Are there any known libraries that I might have missed? Or any standard way of generating families of hash functions with Python that you might be aware of?
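For reference, here is a minimal sketch of the ordering trick described in the question (the (a*x + b) mod p form and the prime 2**31 - 1 are illustrative choices on my part, not recommendations):

import random

def make_hash(seed, p=2147483647):
    # one (a*x + b) mod p hash per seed; p = 2**31 - 1 is prime
    rng = random.Random(seed)
    a, b = rng.randrange(1, p), rng.randrange(0, p)
    return lambda x: (a * x + b) % p

h = make_hash(0)
features = [3, 17, 42, 99]      # toy feature indices
print(sorted(features, key=h))  # "r appears before s" whenever h(r) < h(s)
print(min(features, key=h))     # the min-hash of this feature set under h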
I'd just do something like (if you don't need thread-safety -- not hard to alter if you DO need thread safety -- and assuming a 32-bit Python version):
import random

_memomask = {}

def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash
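A quick sanity check of the factory above: within one process hash() is deterministic, so each returned function is stable, while two different masks give two different functions.

h1 = hash_function(1)
h2 = hash_function(2)
print(h1("foo"), h2("foo"))    # two different values (different masks)
print(h1("foo") == h1("foo"))  # True: each function is consistent within a run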
As mentioned above, you can use universal hashing for minhash.
For example:
import random

def minhash():
    d1 = set(random.randint(0, 2000) for _ in range(1000))
    d2 = set(random.randint(0, 2000) for _ in range(1000))
    jacc_sim = len(d1.intersection(d2)) / len(d1.union(d2))
    print("jaccard similarity: {}".format(jacc_sim))

    N_HASHES = 200
    hash_funcs = []
    for i in range(N_HASHES):
        hash_funcs.append(universal_hashing())

    m1 = [min([h(e) for e in d1]) for h in hash_funcs]
    m2 = [min([h(e) for e in d2]) for h in hash_funcs]
    minhash_sim = sum(int(m1[i] == m2[i]) for i in range(N_HASHES)) / N_HASHES
    print("min-hash similarity: {}".format(minhash_sim))

def universal_hashing():
    def rand_prime():
        while True:
            # start at an odd number so the step-2 walk only visits odd candidates
            p = random.randrange(2 ** 32 + 1, 2 ** 34, 2)
            if all(p % n != 0 for n in range(3, int((p ** 0.5) + 1), 2)):
                return p
    m = 2 ** 32 - 1
    p = rand_prime()
    a = random.randint(0, p)
    if a % 2 == 0:
        a += 1
    b = random.randint(0, p)
    def h(x):
        return ((a * x + b) % p) % m
    return h
Reference
@alex's answer is great and concise, but the hash functions it generates are not "very different from each other".
Let's look at the Pearson correlation between 10000 samples of 10000 hashes that put their results in 100 bins:
%%time # 1min 14s
n=10000
hashes = [hash_function(i) for i in range(n)]
median_pvalue(hashes, n=n)
# 1.1614081043690444e-06
That is, the median p-value is 1e-06, which is far from random. Here's an example if it were truly random:
%%time # 4min 15s
hashes = [lambda _ : random.randint(0,100) for _ in range(n)]
median_pvalue(hashes, n=n)
# 0.4979718236429698
Using the Carter and Wegman method you could get:
%%time # 1min 43s
hashes = HashFamily(100).draw_hashes(n)
median_pvalue(hashes, n=n)
# 0.841929288037321
Code to reproduce:
from scipy.stats.stats import pearsonr
import numpy as np
import random

_memomask = {}

def hash_function(n):
    mask = _memomask.get(n)
    if mask is None:
        random.seed(n)
        mask = _memomask[n] = random.getrandbits(32)
    def myhash(x):
        return hash(x) ^ mask
    return myhash
class HashFamily():
    r"""Universal hash family as proposed by Carter and Wegman.

    .. math::

        \begin{array}{ll}
        h_{a,b}(x) = ((ax + b) \bmod p) \bmod m \mid p > m
        \end{array}

    Args:
        bins (int): Number of bins to hash to. Better if a prime number.
        moduler (int, optional): Temporary hashing. Has to be a prime number.
    """
    def __init__(self, bins, moduler=None):
        if moduler and moduler <= bins:
            raise ValueError("p (moduler) should be >> m (buckets)")
        self.bins = bins
        self.moduler = moduler if moduler else self._next_prime(np.random.randint(self.bins + 1, 2**32))
        # do not allow same a and b, as it could mean shifted hashes
        self.sampled_a = set()
        self.sampled_b = set()

    def _is_prime(self, x):
        """Naive is-prime test."""
        for i in range(2, int(np.sqrt(x)) + 1):  # include the integer square root
            if x % i == 0:
                return False
        return True

    def _next_prime(self, n):
        """Naively gets the next prime larger than n."""
        while not self._is_prime(n):
            n += 1
        return n

    def draw_hash(self, a=None, b=None):
        """Draws a single hash function from the family."""
        if a is None:
            while a is None or a in self.sampled_a:
                a = np.random.randint(1, self.moduler - 1)
                assert len(self.sampled_a) < self.moduler - 2, "please give a bigger moduler"
            self.sampled_a.add(a)
        if b is None:
            while b is None or b in self.sampled_b:
                b = np.random.randint(0, self.moduler - 1)
                assert len(self.sampled_b) < self.moduler - 1, "please give a bigger moduler"
            self.sampled_b.add(b)
        return lambda x: ((a * x + b) % self.moduler) % self.bins

    def draw_hashes(self, n, **kwargs):
        """Draws n hash functions from the family."""
        return [self.draw_hash() for i in range(n)]

def median_pvalue(hashes, buckets=100, n=1000):
    p_values = []
    for j in range(n-1):
        a = [hashes[j](i) % buckets for i in range(n)]
        b = [hashes[j+1](i) % buckets for i in range(n)]
        p_values.append(pearsonr(a, b)[1])
    return np.median(p_values)
Note that my implementation of Carter and Wegman is very naive (e.g. the generation of prime numbers). It could be made shorter and quicker.
You should consider using universal hashing. My answer and code can be found here: https://stackoverflow.com/a/25104050/207661
The universal hash family is a set of hash functions H of size m, such that any two (distinct) inputs collide with probability at most 1/m when the hash function h is drawn randomly from the set H.
Based on the formulation in Wikipedia, you can use the following code:
import random

def is_prime(n):
    if n == 2 or n == 3: return True
    if n % 2 == 0 or n < 2: return False
    for i in range(3, int(n**0.5)+1, 2):
        if n % i == 0:
            return False
    return True

# universal hash functions
class UniversalHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset

        primes = []
        number_to_check = min_value_for_prime_number
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)

        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p))

    def __call__(self, function_index, input_integer):
        a, b, p = self.hash_function_attrs[function_index]
        return (((a*input_integer + b) % p) % self.number_of_buckets) + self.bucket_value_offset
Example usage:
We can create a hash family consisting of 20 hash functions, each one mapping the input to 100 buckets.
hash_family = UniversalHashFamily(20, 100)
And get the hashed values like:
input_integer = 1234567890 # sample input
hash_family(0, input_integer) # the output of the first hash function, i.e. h0(input_integer)
hash_family(1, input_integer) # the output of the second hash function, i.e. h1(input_integer)
# ...
hash_family(19, input_integer) # the output of the last hash function, i.e. h19(input_integer)
If you are interested in a universal hash family for string inputs, you can use the following code. But please note that this code may not be an optimized solution for string hashing.
class UniversalStringHashFamily:
    def __init__(self, number_of_hash_functions, number_of_buckets, min_value_for_prime_number=2, bucket_value_offset=0):
        self.number_of_buckets = number_of_buckets
        self.bucket_value_offset = bucket_value_offset

        primes = []
        number_to_check = max(min_value_for_prime_number, number_of_buckets)
        while len(primes) < number_of_hash_functions:
            if is_prime(number_to_check):
                primes.append(number_to_check)
            number_to_check += random.randint(1, 1000)

        self.hash_function_attrs = []
        for i in range(number_of_hash_functions):
            p = primes[i]
            a = random.randint(1, p)
            a2 = random.randint(1, p)
            b = random.randint(0, p)
            self.hash_function_attrs.append((a, b, p, a2))

    def hash_int(self, int_to_hash, a, b, p):
        return (((a*int_to_hash + b) % p) % self.number_of_buckets) + self.bucket_value_offset

    def hash_str(self, str_to_hash, a, b, p, a2):
        str_to_hash = "1" + str_to_hash  # this will ensure that universality is not affected, see wikipedia for more detail
        l = len(str_to_hash) - 1
        int_to_hash = 0
        for i in range(l + 1):
            int_to_hash += ord(str_to_hash[i]) * (a2 ** (l - i))
        int_to_hash = int_to_hash % p
        return self.hash_int(int_to_hash, a, b, p)

    def __call__(self, function_index, str_to_hash):
        a, b, p, a2 = self.hash_function_attrs[function_index]
        return self.hash_str(str_to_hash, a, b, p, a2)
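Usage mirrors the integer family above (the example string is arbitrary):

string_hash_family = UniversalStringHashFamily(20, 100)
print(string_hash_family(0, "hello world"))   # bucket from the first hash function
print(string_hash_family(19, "hello world"))  # bucket from the last hash function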