calculating the Levenshtein Distance

calculating the Levenshtein Distance - python

I have written a function which calculates the Levenshtein distance between two given strings. However, it seems that it is not working correctly. substitution cost = 2, insertion cost = 1, deletion cost = 1
def MyLevenshtein(String1, String2):
    """Return the weighted edit distance between two strings.

    Insertions and deletions cost 1 each; replacing one character by a
    different one costs 2.

    Args:
        String1: The source string.
        String2: The target string.

    Returns:
        The minimum total cost of turning ``String1`` into ``String2``.
        If one string is empty this is the length of the other one.
    """
    rows = len(String1) + 1
    columns = len(String2) + 1
    # distance[r][c] is the cost of turning String1[:r] into String2[:c].
    distance = [[0] * columns for _ in range(rows)]
    for row in range(1, rows):
        distance[row][0] = row  # delete every character of the prefix
    for column in range(1, columns):
        distance[0][column] = column  # insert every character of the prefix
    for column in range(1, columns):
        for row in range(1, rows):
            cost = 0 if String1[row - 1] == String2[column - 1] else 2
            distance[row][column] = min(
                distance[row - 1][column] + 1,  # deletion
                distance[row][column - 1] + 1,  # insertion
                distance[row - 1][column - 1] + cost,  # substitution
            )
    # The bottom-right cell is read directly instead of through the loop
    # variables, so empty inputs (where the loops never run) work as well.
    return distance[-1][-1]
For example, when I call the function with the strings 'hamchenoonan' and 'hamchenin', 5 is returned, although I expected it to return 7.

Here I've seen many implementations:
https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
So I simply ran every variant that worked out of the box, to see how each of them treats the costs.
import numpy as np
def Mylevenshtein(String1, String2):
    """Return the weighted edit distance between two strings.

    Insertions and deletions cost 1 each; replacing one character by a
    different one costs 2.

    Args:
        String1: The source string.
        String2: The target string.

    Returns:
        The minimum total cost of turning ``String1`` into ``String2``.
        If one string is empty this is the length of the other one.
    """
    rows = len(String1) + 1
    columns = len(String2) + 1
    # distance[r][c] is the cost of turning String1[:r] into String2[:c].
    distance = [[0] * columns for _ in range(rows)]
    for row in range(1, rows):
        distance[row][0] = row  # delete every character of the prefix
    for column in range(1, columns):
        distance[0][column] = column  # insert every character of the prefix
    for column in range(1, columns):
        for row in range(1, rows):
            cost = 0 if String1[row - 1] == String2[column - 1] else 2
            distance[row][column] = min(
                distance[row - 1][column] + 1,  # deletion
                distance[row][column - 1] + 1,  # insertion
                distance[row - 1][column - 1] + cost,  # substitution
            )
    # The bottom-right cell is read directly instead of through the loop
    # variables, so empty inputs (where the loops never run) work as well.
    return distance[-1][-1]
def levenshtein1(s1, s2):
    """Return the unit-cost Levenshtein distance between ``s1`` and ``s2``.

    Only two rows of the dynamic-programming table are kept at a time; the
    longer sequence drives the outer loop.
    """
    longer, shorter = (s2, s1) if len(s1) < len(s2) else (s1, s2)
    if len(shorter) == 0:
        return len(longer)

    above = list(range(len(shorter) + 1))
    for row_number, long_char in enumerate(longer, start=1):
        row = [row_number]
        for j, short_char in enumerate(shorter):
            # Rows are one entry longer than `shorter`, hence the j + 1.
            via_insertion = above[j + 1] + 1
            via_deletion = row[j] + 1
            via_substitution = above[j] + (long_char != short_char)
            row.append(min(via_insertion, via_deletion, via_substitution))
        above = row
    return above[-1]
def levenshtein2(a, b):
    """Return the unit-cost Levenshtein distance between ``a`` and ``b``.

    Uses the textbook recursive definition, memoised on the pair of start
    offsets so each sub-problem is solved once. The plain recursion
    re-solves the same suffix pairs exponentially often and copies the
    inputs on every slice.

    Args:
        a: First sequence.
        b: Second sequence.

    Returns:
        The minimum number of single-element insertions, deletions and
        substitutions that turn ``a`` into ``b``.
    """
    from functools import lru_cache

    len_a, len_b = len(a), len(b)

    @lru_cache(maxsize=None)
    def suffix_distance(i, j):
        # Distance between a[i:] and b[j:].
        if i == len_a:
            return len_b - j
        if j == len_b:
            return len_a - i
        return min(
            suffix_distance(i + 1, j + 1) + (a[i] != b[j]),
            suffix_distance(i + 1, j) + 1,
            suffix_distance(i, j + 1) + 1,
        )

    return suffix_distance(0, 0)
def levenshtein3(s, t):
    """Return the unit-cost Levenshtein distance between strings ``s`` and ``t``.

    Fills the full (len(s) + 1) x (len(t) + 1) table column by column.
    """
    height = len(s) + 1
    width = len(t) + 1
    table = [[0] * width for _ in range(height)]
    for i in range(height):
        table[i][0] = i
    for j in range(width):
        table[0][j] = j
    for j in range(1, width):
        for i in range(1, height):
            if s[i - 1] == t[j - 1]:
                # Matching characters extend the diagonal at no cost.
                table[i][j] = table[i - 1][j - 1]
            else:
                cheapest_neighbour = min(
                    table[i - 1][j], table[i][j - 1], table[i - 1][j - 1]
                )
                table[i][j] = cheapest_neighbour + 1
    return table[height - 1][width - 1]
def levenshtein5(source, target):
    """Return the unit-cost edit distance, computed row by row with NumPy.

    The arguments are swapped if needed so that ``source`` is the longer
    sequence. Only the previous and the current table row are kept.
    """
    if len(source) < len(target):
        return levenshtein5(target, source)
    # From here on len(source) >= len(target).
    if len(target) == 0:
        return len(source)

    # tuple() makes NumPy treat a string as a sequence of characters
    # rather than as one scalar value.
    source_items = np.array(tuple(source))
    target_items = np.array(tuple(target))

    above = np.arange(target_items.size + 1)
    for item in source_items:
        # Start from "one more than the cell above" for every column.
        row = above + 1
        # Diagonal step: free on a match, cost 1 on a mismatch.
        diagonal = above[:-1] + (target_items != item)
        row[1:] = np.minimum(row[1:], diagonal)
        # Horizontal step: one more than the cell to the left.
        row[1:] = np.minimum(row[1:], row[:-1] + 1)
        above = row
    return above[-1]
def levenshtein6(s, t):
    '''Return the unit-cost Levenshtein distance using two table rows.'''
    if s == t:
        return 0
    if len(s) == 0:
        return len(t)
    if len(t) == 0:
        return len(s)

    previous = list(range(len(t) + 1))
    for i, s_item in enumerate(s):
        # First column: delete the first i + 1 items of s.
        current = [i + 1]
        for j, t_item in enumerate(t):
            cost = 0 if s_item == t_item else 1
            current.append(
                min(current[j] + 1, previous[j + 1] + 1, previous[j] + cost)
            )
        previous = current
    return previous[len(t)]
# Call every module-level object whose name contains "leven" on two sample
# word pairs and print what it returns.
for implementation_variant in [name for name in globals() if "leven" in name]:
    print("Try variant %s" % implementation_variant)
    for a, b in (("hamchenoonan", "hamchenin"), ("Tier", "Tor")):
        distance = globals()[implementation_variant](a, b)
        print(" -Distance of %s and %s is %i" % (a, b, distance))
The output shows:
Try variant Mylevenshtein
-Distance of hamchenoonan and hamchenin is 5
-Distance of Tier and Tor is 3
Try variant levenshtein1
-Distance of hamchenoonan and hamchenin is 4
-Distance of Tier and Tor is 2
Try variant levenshtein2
-Distance of hamchenoonan and hamchenin is 4
-Distance of Tier and Tor is 2
Try variant levenshtein3
-Distance of hamchenoonan and hamchenin is 4
-Distance of Tier and Tor is 2
Try variant levenshtein5
-Distance of hamchenoonan and hamchenin is 4
-Distance of Tier and Tor is 2
Try variant levenshtein6
-Distance of hamchenoonan and hamchenin is 4
-Distance of Tier and Tor is 2
The distance of Tier and Tor is mentioned in the german wikipedia, just as a second verification. So the democratic answer seems to be 4.

Your code is correct.
The answer is 5, but it is reached by a different sequence of edits than the one you had in mind:
hamchenoonan -> (substitution +2)
^
hamchenionan -> (delete +1)
^
hamcheninan -> (delete +1)
^
hamcheninn -> (delete +1)
^
hamchenin
Plug 1.99 as the substitution cost into your code and it's obvious only one substitution is made.

Related

Write an algorithm for the sequence

Calculate the n member of the sequence given by the formulas
a[2 * n] = a[n] + 1
a[2 * n + 2] = a[2 * n + 1] - a[n]
a[0] = a[1] = 1
n > 0
I've tried a lot of variants, but I can't find correct one.
# Reads n from stdin, fills a table a[0 .. n + 2] in one forward pass and
# prints a[n].
n = int(input())
a = [0] * (n + 3)
a[0] = a[1] = 1
i = 1
while 2 * i + 2 < n + 3:
    a[2 * i] = a[i] + 1
    # NOTE: a[2 * i + 2] is read on the next line before the line after it
    # assigns that slot.
    a[2 * i + 1] = a[2 * i + 2] + a[i]
    a[2 * i + 2] = a[2 * i + 1] - a[i]
    i += 1
print(a[n])

We should first compute the expected output for the first few numbers to let us have an idea what the sequence is like first,
a[0] = a[1] = 1
Substitute n = 1 in the first recurrence relation gives
a[2] = a[1] + 1 = 2
Substitute n = 1 in the second recurrence relation gives
a[4] = a[3] - a[1]
But a[4] = a[2] + 1 = 3 according to the first recurrence relation, so 3 = a[3] - 1, which gives a[3] = 4
We have a = {1, 1, 2, 4, 3, ... }
Your program gives a = {1, 1, 2, 1, 3, ...}
What went wrong in your program?
We notice that when i = 1, the line a[2 * i + 1] = a[2 * i + 2] + a[i] evaluates to a[3] = a[4] + a[1]. However, at that time, a[4] is not evaluated yet, causing an incorrect output.
The issue, therefore, lies in how you order your statements in the while loop. Make sure that statements in your loop only make use of values that will not be changed later.
How should we do that?
if we manipulate the second recurrence relation as follows:
a[2 * i + 2] = a[2 * i + 1] - a[i]
a[2 * i + 1] = a[2 * (i + 1)] + a[i]
Using the first recurrence relation, we have
a[2 * i + 1] = a[i + 1] + 1 + a[i]
which should resolve the issue since 2 * n + 1 > n + 1 for all positive n.
After modifying the second statement, you check that every element in a is computed and you should be done.
Note
One more thing to note is that the third statement is redundant since the first statement covers all even elements in a already.
In fact, a more efficient approach exists (in particular, a logarithmic-time solution) if you only have to calculate the n-th member of the sequence.

I found a solution:
# Reads n from stdin, fills the table a[0 .. k] (k is n rounded up to an even
# number) by repeated sweeps, and prints a[n].
n = int(input())
k = n + n % 2
a = [None] * (k + 1)
a[0] = a[1] = 1


def fill_list(a):
    """Sweep over ``a`` until no entry is ``None`` any more.

    Each sweep first derives even-indexed entries from a[i], then derives
    odd-indexed entries from the even entry after them. Reads the
    module-level ``k`` as the highest index.
    """
    while None in a:
        # a[2i] = a[i] + 1 wherever a[i] is already known.
        for i in range(1, k // 2 + 1):
            if a[i] is not None:
                a[2 * i] = a[i] + 1
        # a[2i + 1] = a[2i + 2] + a[i] wherever both inputs are known.
        for i in range(1, k // 2):
            if a[2 * i + 2] is not None and a[i] is not None:
                a[2 * i + 1] = a[2 * i + 2] + a[i]


fill_list(a)
print(a[n])

Your second formula gives a[2n+2] = a[2n+1] - a[n]. That can be rewritten: a[2n+1] = a[2n+2] + a[n] which is a[n+1] + a[n] + 1 from the first formula.
We can use this to write a simple dynamic programming algorithm that runs in linear time:
def A(n):
    """Return the n-th member of the sequence via a bottom-up table.

    Entries 0 and 1 are 1; an even index i gets table[i // 2] + 1 and an odd
    index i gets table[i // 2] + table[i // 2 + 1] + 1.
    """
    table = [1] * (n + 1)
    for index in range(2, n + 1):
        half = index // 2
        if index % 2:
            table[index] = table[half] + table[half + 1] + 1
        else:
            table[index] = table[half] + 1
    return table[n]
However, we can note that we can solve this in logarithmic time, by noting that we can compute both a[n] and a[n+1] from a[n//2] and a[n//2+1].
If n is even, then a[n]=a[n//2]+1 and a[n+1]=a[n//2]+a[n//2+1]+1.
And if n is odd, then a[n]=a[n//2]+a[n//2+1]+1 and a[n+1]=a[n//2+1]+1.
These are just applications of the formulas we have already.
This gives us this solution:
def A2(n):
    """Return the pair (a[n], a[n + 1]) of the sequence.

    Each call halves ``n``, so the recursion depth grows with the number of
    binary digits of ``n``.
    """
    base_cases = {0: (1, 1), 1: (1, 2)}
    if n in base_cases:
        return base_cases[n]
    low, high = A2(n // 2)  # (a[n // 2], a[n // 2 + 1])
    odd_member = low + high + 1
    if n % 2:
        return odd_member, high + 1
    return low + 1, odd_member
Note that this returns 2 values, but for all n, A(n) == A2(n)[0].

How can I store a very little value (approximately 10^(-32)) in numpy array?

I have to store very small numbers in a numpy array. At the end, min should be around 10^(-32), but the program only returns the proper value down to about 10^(-14). Is there any way to fix this problem? Thanks for the answers!
import numpy as np
import math
def Modify(array):
    """Merge the largest entry of a cyclic 1-D chain with its two neighbours.

    The maximum and its cyclic neighbours are collapsed into the single
    value ``previous + next - maximum``, so the result is two entries
    shorter than the input.

    Args:
        array: One-dimensional NumPy array. One of its slots is overwritten
            in place with the merged value before the shortened copy is
            made.

    Returns:
        A new array with two entries removed.
    """
    size = array.shape[0]
    j = np.argmax(array)
    prevIndex = (j - 1) % size
    nextIndex = (j + 1) % size
    merged = array[prevIndex] + array[nextIndex] - array[j]

    # Only the slot that keeps the merged value depends on where the maximum
    # sits; the other two slots of the triple are dropped.
    if j == size - 1:
        array[nextIndex] = merged
        dropped = [prevIndex, j]
    elif j == 0:
        array[prevIndex] = merged
        dropped = [j, nextIndex]
    else:
        array[j] = merged
        dropped = [prevIndex, nextIndex]
    return np.delete(array, dropped)
if __name__ == "__main__":
    # Monte-Carlo search for the smallest |e - E| over many random chains.
    a = 10000
    smallest = 1
    for i in range(a):
        chain = np.random.uniform(0, 1, 256)
        chain = chain.astype('float64')
        # Work with logarithms while the chain is being reduced.
        for k in range(len(chain)):
            chain[k] = math.log(chain[k])
        while chain.shape[0] > 4:
            chain = Modify(chain)
        # Back from logarithms to the actual values.
        for k in range(4):
            chain[k] = math.exp(chain[k])
        chain[2] = chain[0] + chain[2]
        E = -math.sqrt((chain[1]**2) + (chain[3]**2) + (chain[2]**2) + 2*chain[1]*chain[3])
        e = -math.sqrt((chain[1]**2) + (chain[3]**2) + (chain[2]**2) - 2*chain[1]*chain[3])
        # Difference of two nearly equal floats: this is where precision is
        # lost when the cross term is tiny.
        m = abs(e - E)
        if m < smallest:
            smallest = m
    print(smallest)

The underlying problem here is not about storing numbers that small, but rather about the finite precision of floating-point numbers (which is a bit less than 16 decimal digits for 64-bit floats). If the (true) relative difference between e and E is smaller than this precision, e - E will evaluate to zero. This is called "catastrophic cancellation."
A simple and effective way to get around this is to expand e - E in a power series about 2*chain[1]*chain[3] = 0 and take the first non-zero term. If we pull out the common terms between e and E into separate variables, we have
# Sum of squares that appears under both square roots.
term_1 = chain[1]**2 + chain[3]**2 + chain[2]**2
# Cross term: added under the root for E, subtracted under the root for e.
term_2 = 2*chain[1]*chain[3]
E = -math.sqrt(term_1 + term_2)
e = -math.sqrt(term_1 - term_2)
# e and E differ only through term_2, so this subtracts two nearly equal
# numbers whenever term_2 is tiny compared with term_1.
m = abs(e - E)
The first non-zero term in the series works out to term_2 / sqrt(term_1). The next term is of order (term_2/term_1)^3, so we can safely truncate the series before that if (term_2/term_1)^2 < 1e-16 (16 decimal digits), or term_2 / term_1 < 1e-8. If the ratio is larger than that, our approximation might be slightly off, but at that point the normal e - E works just fine.
The final code (with some extra numpy simplifications):
if __name__ == "__main__":
    # Monte-Carlo search for the smallest |e - E| over many random chains.
    a = 10000
    smallest = 1  # named so that the builtin min() is not shadowed
    for i in range(a):
        chain = np.log(np.random.uniform(0, 1, 256))
        while chain.shape[0] > 4:
            chain = Modify(chain)
        chain = np.exp(chain)
        chain[2] = chain[2] + chain[0]
        term_1 = chain[1]**2 + chain[3]**2 + chain[2]**2
        term_2 = 2 * chain[1] * chain[3]
        if abs(term_2 / term_1) < 1e-8:
            # The direct difference would cancel catastrophically here, so
            # use the leading term of the power series of e - E in term_2.
            # The first omitted term is smaller by a factor of order
            # (term_2 / term_1)**2, i.e. below double-precision resolution.
            m = term_2 / math.sqrt(term_1)
        else:
            e = -math.sqrt(term_1 - term_2)
            E = -math.sqrt(term_1 + term_2)
            m = abs(e - E)
        if m < smallest:
            smallest = m
    print(smallest)

As Far From Land as Possible - DP Solution

I was working on a problem from Leetcode "As Far from Land as Possible", which can be found here: https://leetcode.com/problems/as-far-from-land-as-possible/
One solution that is guaranteed to work is to have 4 DP arrays, each of which start from different corners of the grid, and compute the distance to the nearest lands as you head to the opposite corner. In the end, taking the minimum of the elements in all 4 arrays should output the correct solution.
I tried writing a DP solution that does this with only one array, updating each cell by going through the four directions.
My code gives incorrect answers and I can't seem to find where the mistake is.
def maxDistance(self, grid: List[List[int]]) -> int:
    """Single-array attempt at the "As Far from Land as Possible" DP.

    For every water cell (k, m) the same relaxation is applied to that cell
    and to its three mirror images in the grid. Returns the largest value
    left in the table, or -1 if that value is 0 or infinite.
    """
    N = len(grid)
    dpfin = [[float('inf')] * N for _ in range(N)]

    def relax(r, c, use_up, use_left):
        # Lower dpfin[r][c] through the chosen neighbours. Index r - 1 or
        # c - 1 may be -1 here, which Python wraps to the last row/column.
        best = dpfin[r][c]
        if use_up:
            best = min(best, dpfin[r - 1][c] + 1)
        if use_left:
            best = min(best, dpfin[r][c - 1] + 1)
        dpfin[r][c] = best

    for k in range(N):
        for m in range(N):
            if grid[k][m] == 1:
                dpfin[k][m] = 0
                continue
            if k == 0 and m == 0:
                continue
            # The first row only looks left, the first column only looks up,
            # and every other cell looks both ways.
            use_up = k != 0
            use_left = m != 0
            mirror_k = N - 1 - k
            mirror_m = N - 1 - m
            for r, c in ((k, m), (mirror_k, m), (mirror_k, mirror_m), (k, mirror_m)):
                relax(r, c, use_up, use_left)

    maxi = 0
    for row in dpfin:
        for value in row:
            maxi = max(maxi, value)
    if maxi == float('inf') or maxi == 0:
        return -1
    return maxi

I have to admit that it was pretty hard, but I think I have a first "unoptimized" solution. First the code, then the explanation:
class Solution:
    """Distance-to-nearest-land table, filled in growing top-left squares."""

    def _rec_fixDP(self, grid, DP, r, c, max_offset, dist):
        """Push a smaller distance ``dist`` into (r, c) and on to its neighbours.

        Only cells with both coordinates in 0..max_offset are touched. The
        recursion stops at land cells and at cells whose stored distance is
        already no larger than ``dist``.
        """
        if not (0 <= r <= max_offset and 0 <= c <= max_offset):
            return
        if grid[r][c] == 1:
            return
        if DP[r][c] != -1 and DP[r][c] <= dist:
            return
        DP[r][c] = dist
        for dr, dc in ((-1, 0), (0, -1), (1, 0), (0, 1)):
            self._rec_fixDP(grid, DP, r + dr, c + dc, max_offset, dist + 1)

    def _dp_iteration(self, r, c, grid, DP):
        """Set DP[r][c]; for a land cell, also repair earlier cells."""
        if grid[r][c] != 0:
            DP[r][c] = 0
            max_offset = max(r, c)
            self._rec_fixDP(grid, DP, r - 1, c, max_offset, 1)
            self._rec_fixDP(grid, DP, r, c - 1, max_offset, 1)
            return
        # -1 stands for "no land reachable yet" (or "outside the grid").
        from_previous_row = DP[r - 1][c] if r >= 1 else -1
        from_previous_col = DP[r][c - 1] if c >= 1 else -1
        known = [d for d in (from_previous_row, from_previous_col) if d != -1]
        DP[r][c] = min(known) + 1 if known else -1

    def maxDistance(self, grid) -> int:
        """Return the largest distance in the table, or -1 if it is not positive."""
        n = len(grid)
        DP = [[-1] * n for _ in range(n)]
        for i in range(n):
            # Row i up to (but excluding) the diagonal, then column i down to
            # and including the diagonal cell.
            for c in range(i):
                self._dp_iteration(i, c, grid, DP)
            for r in range(i + 1):
                self._dp_iteration(r, i, grid, DP)
        cur_max = -1
        for row in DP:
            cur_max = max(cur_max, max(row))
        print(DP)
        return cur_max if cur_max > 0 else -1


sol = Solution()
l = [[0,0,0],[0,0,0],[0,0,1]]
print(sol.maxDistance(l))
I have submitted it to leetcode.com and these are the results:
Runtime: 860 ms, faster than 19.20% of Python3 online submissions for
As Far from Land as Possible.
Memory Usage: 14 MB, less than 100.00%
of Python3 online submissions for As Far from Land as Possible.
Consider this simple grid
1 2 3
4 5 6
7 8 9
I am looping over the grid in this way: 1 4 2 5 7 8 3 6 9. This is because when i visit grid[r][c] I will have already visited grid[r - 1][c] and grid[r][c - 1].
Next, the logic is split in 2 parts: when grid[r][c] = 0 and when grid[r][c] = 1.
For grid[r][c] = 0,
DP[r][c] = min(DP[r - 1][c], DP[r][c - 1]) + 1
The exception is the
value -1: it means it has not been found a land cell yet.
For grid[r][c] = 1,
DP[r][c] = 0
Even if this part is pretty simple (the distance between the land cell
and itself is 0), you also need to fix all the previous calculated
distances, since they could all be -1 or they could have large values.
This last part is executed by _rec_fixDP, called until the current
distance is lesser than the stored one.
The complexity is easy to estimate: you need to loop over every cell at least once (exactly once in the best case), but _rec_fixDP could revisit all the previously visited cells. So:
Best case: O(n^2)
Average case: O(n^4)
However I suspect it is possible to do it in less than O(n^4).

Logic error in my Longest Common Subsequence python

I have implemented solution of Longest Common Subsequence using Dynamic programming in python. For those who don't know LCS here's the link.
https://www.tutorialspoint.com/design_and_analysis_of_algorithms/design_and_analysis_of_algorithms_longest_common_subsequence.htm
My code is not returning the the most optimal answer. What is wrong in my logic ?
import enum
class LCS:
    """Memoised top-down longest-common-subsequence search with back-pointers."""

    class Dir(enum.Enum):
        up = 1
        diagonal = 2
        left = 3
        none = 0

    def LCS(self, x, y):
        """Fill the memo table for ``x`` and ``y``, print it, then print the result."""
        self.DP = {}
        m = len(x) - 1
        n = len(y) - 1
        self.recursion(x, y, m, n)
        print(self.DP)
        self.printLCS(x, m, n)

    def recursion(self, x, y, i, j):
        """Return ``[length, direction]`` for the index pair (i, j)."""
        # NOTE: the recursion stops as soon as either index is 0, so x[0]
        # and y[0] are never compared.
        if i == 0 or j == 0:
            return [0, self.Dir.none]
        key = (i, j)
        if key in self.DP:
            return self.DP[key]
        if x[i] == y[j]:
            entry = [self.recursion(x, y, i - 1, j - 1)[0] + 1, self.Dir.diagonal]
        else:
            from_up = self.recursion(x, y, i - 1, j)
            from_left = self.recursion(x, y, i, j - 1)
            if from_up[0] >= from_left[0]:
                entry = [from_up[0], self.Dir.up]
            else:
                entry = [from_left[0], self.Dir.left]
        self.DP[key] = entry
        return entry

    def printLCS(self, string, i, j):
        """Follow the stored directions from (i, j) and print matched characters."""
        if i == 0 or j == 0:
            return
        direction = self.DP[(i, j)][1]
        if direction == self.Dir.diagonal:
            self.printLCS(string, i - 1, j - 1)
            print(string[i], end="")
        elif direction == self.Dir.up:
            self.printLCS(string, i - 1, j)
        else:
            self.printLCS(string, i, j - 1)


x = "BDCABA"
y = "ABCBDAB"
sol = LCS()
sol.LCS(x, y)
Expected = "BCBA", Actual = "DAB"

The problem is your base cases.
Strings in Python are 0-based, so the first character of a string s is s[0], not s[1]. You must therefore end the recursion when you step before the first element, not when you reach it.
Just replace if i == 0 or j == 0: with if i == -1 or j == -1: in both printLCS and recursion; you will then get the output BDAB, which is one of the correct answers.

Finding null space of binary matrix in python

In factoring methods based on the quadratic sieve, finding the left null space of a binary matrix (values computed mod 2) is a crucial step. (This is also the null space of the transpose.) Does numpy or scipy have tools to do this quickly?
For reference, here is my current code:
# Row-reduce binary matrix
def binary_rr(m):
    """Row-reduce a binary matrix to row echelon form over GF(2), in place.

    Args:
        m: Two-dimensional NumPy integer/boolean array holding only 0s and
            1s. Rows are swapped and XOR-ed in place.

    Returns:
        The same array object ``m``, now in row echelon form.
    """
    rows, cols = m.shape
    col = 0
    for k in range(min(rows, cols)):
        # Find the next column (at or after `col`) that has a 1 in row k or
        # below. Row k itself must be included in the search: skipping it
        # would pass over a valid pivot already sitting in row k.
        pivot_row = None
        while col < cols:
            for i in range(k, rows):
                if m[i, col]:
                    pivot_row = i
                    break
            if pivot_row is not None:
                break
            col += 1
        if pivot_row is None:
            break  # the remaining rows are all zero
        if pivot_row != k:
            m[[pivot_row, k]] = m[[k, pivot_row]]  # swap rows
        # Clear the pivot column in every row below (subtraction mod 2 = XOR).
        for i in range(k + 1, rows):
            if m[i, col]:
                m[i] ^= m[k]
        col += 1
    return m
It is pretty much a straightforward implementation of Gaussian elimination, but since it's written in python it is very slow.

qwr, I found a very fast Gaussian elimination routine that finishes so quickly that the slow point becomes the quadratic sieving (SIQS sieving) step. The Gaussian elimination functions were taken from skollmann's factorise.py at https://raw.githubusercontent.com/skollmann/PyFactorise/master/factorise.py
I'll soon be working on a SIQS/GNFS implementation from scratch, and hope to write something super quick for python with multithreading and possiblly cython. In the meantime, if you want something that compiles C (Alpertons ECM Engine) but uses python, you can use: https://github.com/oppressionslayer/primalitytest/ which requires you to cd into calculators directory and run make before importing p2ecm with from sfactorint import p2ecm. With that you can factorise 60 digit numbers in a few seconds.
# Requires sympy and numpy to be installed
# Adjust B and I accordingly. Set for 32 length number
# Usage:
# N=1009732533765251*1896182711927299
# factorise(N, 5000, 25000000) # Takes about 45-60 seconds on a newer computer
# N=1009732533765251*581120948477
# Linear Algebra Step finishes in 1 second, if that
# N=factorise(N, 5000, 2500000) # Takes about 5 seconds on a newer computer
# #Out[1]: 581120948477
import math
import numpy as np
from sympy import isprime
#
# siqs_ functions are the Gaussian Elimination routines right from
# skollmans factorise.py. It is the fastest Gaussian Elimination that i have
# found in python
#
def siqs_factor_from_square(n, square_indices, smooth_relations):
    """Turn one solution of the linear-algebra step into a factor of n.

    The solution is converted into a pair a, b with a*a = b*b (mod n); the
    returned value is gcd(|a - b|, n), which may be a trivial factor.
    """
    a, b = siqs_calc_sqrts(square_indices, smooth_relations)
    assert (a * a) % n == (b * b) % n
    difference = abs(a - b)
    return math.gcd(difference, n)
def siqs_find_factors(n, perfect_squares, smooth_relations):
"""Perform the last step of the Self-Initialising Quadratic Field.
Given the solutions returned by siqs_solve_matrix_opt, attempt to
identify a number of (not necessarily prime) factors of n, and
return them.
"""
factors = []
rem = n
non_prime_factors = set()
prime_factors = set()
for square_indices in perfect_squares:
fact = siqs_factor_from_square(n, square_indices, smooth_relations)
if fact != 1 and fact != rem:
if isprime(fact):
if fact not in prime_factors:
print ("SIQS: Prime factor found: %d" % fact)
prime_factors.add(fact)
while rem % fact == 0:
factors.append(fact)
rem //= fact
if rem == 1:
break
if isprime(rem):
factors.append(rem)
rem = 1
break
else:
if fact not in non_prime_factors:
print ("SIQS: Non-prime factor found: %d" % fact)
non_prime_factors.add(fact)
if rem != 1 and non_prime_factors:
non_prime_factors.add(rem)
for fact in sorted(siqs_find_more_factors_gcd(non_prime_factors)):
while fact != 1 and rem % fact == 0:
print ("SIQS: Prime factor found: %d" % fact)
factors.append(fact)
rem //= fact
if rem == 1 or sfactorint_isprime(rem):
break
if rem != 1:
factors.append(rem)
return factors
def add_column_opt(M_opt, tgt, src):
    """Add column ``src`` to column ``tgt`` modulo 2.

    Columns are integers whose bits are the matrix entries, so addition
    mod 2 is a bitwise XOR.
    """
    M_opt[tgt] = M_opt[tgt] ^ M_opt[src]
def find_pivot_column_opt(M_opt, j):
    """Return the row of the first non-zero entry in column j, or None.

    ``M_opt[j]`` is the integer encoding of column j; None is returned when
    that integer is 0.
    """
    column = M_opt[j]
    if column != 0:
        return lars_last_powers_of_two_trailing(column + 1)
    return None
def siqs_build_matrix_opt(M):
    """Encode each column of the 0/1 matrix M as one integer.

    Bit i of the j-th returned integer equals M[i][j].

    Returns:
        A tuple ``(columns, number_of_rows, number_of_columns)``.
    """
    m = len(M[0])
    encoded = [0] * m
    for i, row in enumerate(M):
        for j, entry in enumerate(row):
            if entry:
                encoded[j] |= 1 << i
    return encoded, len(M), m
def siqs_solve_matrix_opt(M_opt, n, m):
    """
    Linear-algebra step of the SIQS: Gaussian elimination over GF(2) on the
    column-encoded matrix ``M_opt`` (n rows, m columns), following [1].

    Returns a list of index lists; each one starts with an unmarked row and
    continues with the pivot rows of the columns that have a 1 in that row.

    [1] Koç, Çetin K., and Sarath N. Arachchige. 'A Fast Algorithm for
        Gaussian Elimination over GF (2) and its Implementation on the
        GAPP.' Journal of Parallel and Distributed Computing 13.1
        (1991): 118-122.
    """
    marked_rows = [False] * n
    pivot_of_column = [-1] * m
    for j in range(m):
        pivot = find_pivot_column_opt(M_opt, j)
        if pivot is None:
            continue
        pivot_of_column[j] = pivot
        marked_rows[pivot] = True
        # Clear the pivot row in every other column that has a 1 there.
        for k in range(m):
            if k == j:
                continue
            if (M_opt[k] >> pivot) & 1:  # M[pivot][k] == 1
                add_column_opt(M_opt, k, j)

    solutions = []
    for i in range(n):
        if marked_rows[i]:
            continue
        indices = [i]
        indices.extend(
            pivot_of_column[j] for j in range(m) if (M_opt[j] >> i) & 1
        )
        solutions.append(indices)
    return solutions
def sqrt_int(N):
    """Return the exact integer square root of N.

    Raises AssertionError if N is not a perfect square.
    """
    root = math.isqrt(N)
    assert root * root == N
    return root
def siqs_calc_sqrts(square_indices, smooth_relations):
    """Compute the pair [a, b] with a^2 = b^2 (mod n) for one solution.

    ``a`` is the product of element 0 of the selected relations; ``b`` is
    the exact integer square root of the product of their element 1.
    """
    product_of_roots = 1
    product_of_values = 1
    for idx in square_indices:
        relation = smooth_relations[idx]
        product_of_roots *= relation[0]
        product_of_values *= relation[1]
    return [product_of_roots, sqrt_int(product_of_values)]
def quad_residue(a, n):
    """Return a**((n - 1) // 2) mod n (Euler's criterion when n is an odd prime).

    For an odd prime n the result is 1 if ``a`` is a non-zero quadratic
    residue, n - 1 if it is a non-residue, and 0 if n divides ``a``.

    Args:
        a: The number to test.
        n: The modulus.

    Returns:
        The modular power described above; 1 when the exponent is 0
        (n equal to 1 or 2).
    """
    exponent = (n - 1) // 2
    if exponent == 0:
        return 1
    # Built-in three-argument pow does the square-and-multiply loop.
    return pow(a % n, exponent, n)
def STonelli(n, p):
    """Return both square roots of n modulo p as a pair (r, p - r).

    Raises AssertionError when quad_residue(n, p) is not 1.
    """
    assert quad_residue(n, p) == 1, "not a square (mod p)"

    # Write p - 1 as odd_part * 2**two_power.
    odd_part = p - 1
    two_power = 0
    while odd_part % 2 == 0:
        odd_part //= 2
        two_power += 1

    if two_power == 1:
        # p - 1 has a single factor 2: the root is a direct power of n.
        root = pow(n, (p + 1) // 4, p)
        return root, p - root

    # First z >= 2 for which quad_residue reports p - 1.
    for z in range(2, p):
        if quad_residue(z, p) == p - 1:
            break

    c = pow(z, odd_part, p)
    root = pow(n, (odd_part + 1) // 2, p)
    t = pow(n, odd_part, p)
    m = two_power
    while (t - 1) % p != 0:
        # Find the least i in 1..m-1 with t**(2**i) == 1 (mod p).
        t_power = (t * t) % p
        for i in range(1, m):
            if (t_power - 1) % p == 0:
                break
            t_power = (t_power * t_power) % p
        b = pow(c, 1 << (m - i - 1), p)
        root = (root * b) % p
        c = (b * b) % p
        t = (t * c) % p
        m = i
    return (root, p - root)
def build_smooth_relations(smooth_base, root_base):
    """Pair every smooth value with its root and its position.

    Returns a list of ``(root, smooth_value, index)`` tuples, one per entry
    of ``smooth_base``.
    """
    return [
        (root_base[position], smooth_base[position], position)
        for position in range(len(smooth_base))
    ]
def strailing(N):
    """Shift N right by lars_last_powers_of_two_trailing(N) bits."""
    shift = lars_last_powers_of_two_trailing(N)
    return N >> shift
def lars_last_powers_of_two_trailing(N):
    """Return the number of trailing zero bits of N - 1 for odd integer N.

    Returns 0 when N is even and when N is 1.
    """
    if N & 1 == 0:
        return 0
    predecessor = N - 1
    if predecessor == 0:
        return 0
    # x & -x isolates the lowest set bit; its bit length minus one is the
    # position of that bit.
    lowest_bit = predecessor & -predecessor
    return lowest_bit.bit_length() - 1
def build_matrix(factor_base, smooth_base):
    """Build the exponent-parity matrix of the smooth values.

    The prime 2 is put in front of a copy of ``factor_base``. For every
    smooth value and every prime, the entry is the parity (0 or 1) of the
    exponent of that prime in the value.

    Returns:
        The transposed matrix as a NumPy array: one row per prime, one
        column per smooth value.
    """
    extended_base = factor_base.copy()
    extended_base.insert(0, 2)
    parity_rows = []
    for value in smooth_base:
        parity_row = []
        for prime in extended_base:
            exponent = 0
            while value % prime == 0:
                value //= prime
                exponent += 1
            parity_row.append(exponent % 2)
        parity_rows.append(parity_row)
    return np.transpose(parity_rows)
def get_mod_congruence(root, N, withstats=False):
    """Return ``root - N``; optionally print the congruence first."""
    difference = root - N
    if withstats == True:
        print(f"{root} ≡ {difference} mod {N}")
    return difference
def primes_sieve2(limit):
    """Yield the primes below ``limit`` in increasing order (sieve of Eratosthenes)."""
    still_candidate = np.ones(limit, dtype=bool)
    still_candidate[0] = still_candidate[1] = False
    for number in range(limit):
        if still_candidate[number]:
            yield number
            # Cross out the multiples, starting at the square.
            still_candidate[number * number::number] = False
def remove_singletons(XX):
    """Return the elements of XX whose length is not exactly 1, in order."""
    return [candidate for candidate in XX if len(candidate) != 1]
def fb_sm(N, B, I, max_relations=350):
    """Build the factor base and collect smooth relations for N.

    Args:
        N: The number to factor.
        B: Upper bound (exclusive) for the primes considered for the
            factor base.
        I: Number of consecutive values x = 0 .. I - 1 to sieve.
        max_relations: Stop collecting once this many smooth values have
            been found. The default of 350 is the cap that used to be
            hard-coded; it suits numbers of fewer than about 32 digits and
            should be raised for larger inputs.

    Returns:
        A tuple ``(factor_base, smooth_base, root_base)``: the odd primes p
        below B with quad_residue(N, p) == 1, the smooth values
        (root + x)**2 - N, and the matching root + x.
    """
    factor_base, sieve_base, sieve_list, smooth_base, root_base = [], [], [], [], []
    small_primes = list(primes_sieve2(B))
    root = math.isqrt(N)
    # Skip the prime 2; keep the odd primes for which N is a square.
    for x in small_primes[1:]:
        if quad_residue(N, x) == 1:
            factor_base.append(x)
    for x in range(I):
        xx = get_mod_congruence((root + x)**2, N)
        sieve_list.append(xx)
        if xx % 2 == 0:
            xx = strailing(xx + 1)  # strips the factors of 2 from xx
        sieve_base.append(xx)
    for p in factor_base:
        residues = STonelli(N, p)
        for r in residues:
            # Every p-th position starting here is divisible by p.
            for i in range((r - root) % p, len(sieve_list), p):
                while sieve_base[i] % p == 0:
                    sieve_base[i] //= p
    for o in range(len(sieve_list)):
        if len(smooth_base) >= max_relations:
            break
        # A value reduced to 1 factored completely over the base.
        if sieve_base[o] == 1:
            smooth_base.append(sieve_list[o])
            root_base.append(root + o)
    return factor_base, smooth_base, root_base
def isSquare(hm):
    """Return True if the non-negative integer ``hm`` is a perfect square."""
    root = math.isqrt(hm)
    return root * root == hm
def find_square(smooth_base):
    """Locate the first perfect square in ``smooth_base``.

    Returns ``(True, index)`` for the first element that is a perfect
    square, or ``(False, -1)`` if there is none.
    """
    for position, value in enumerate(smooth_base):
        if isSquare(value):
            return (True, position)
    return (False, -1)
# Module-level slot that factorise() rebinds through `global`.
t_matrix=[]
# Primes below 1,000,000, computed once when the module is loaded.
primes=list(primes_sieve2(1000000))
def factorise(N, B=10000, I=10000000):
    """Find factors of N with a quadratic-sieve pipeline.

    Returns N itself if it is prime, a single small prime factor if trial
    division by the module-level ``primes`` finds one, a single gcd if one
    of the smooth values is a perfect square, and otherwise the list
    produced by siqs_find_factors.
    """
    global primes, t_matrix
    if isprime(N):
        return N
    for small_prime in primes:
        if N % small_prime == 0:
            return small_prime

    factor_base, smooth_base, root_base = fb_sm(N, B, I)

    # Shortcut: a smooth value that is itself a square gives a congruence
    # of squares without any linear algebra.
    issquare, t_matrix = find_square(smooth_base)
    if issquare:
        square_root = math.isqrt(smooth_base[t_matrix])
        shifted_root = get_mod_congruence(root_base[t_matrix], N)
        return math.gcd(square_root + shifted_root, N)

    t_matrix = build_matrix(factor_base, smooth_base)
    smooth_relations = build_smooth_relations(smooth_base, root_base)
    M_opt, M_n, M_m = siqs_build_matrix_opt(np.transpose(t_matrix))
    solutions = siqs_solve_matrix_opt(M_opt, M_n, M_m)
    perfect_squares = remove_singletons(solutions)
    return siqs_find_factors(N, perfect_squares, smooth_relations)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

calculating the Levenshtein Distance - python

Related

Write an algorithm for the sequence

How can I store a very little value (approximately 10^(-32)) in numpy array?

As Far From Land as Possible - DP Solution

Logic error in my Longest Common Subsequence python

Finding null space of binary matrix in python

Categories

Resources