Rabin-Karp 2D pattern search runs slower than brute force - python

I have implemented the Rabin-Karp 2D algorithm for pattern searching in Python. However, my implementation is slower than the brute-force version on a 1000x2000 matrix. Please help me identify the bottleneck. Thanks, and I appreciate your comments.
Note 1: the code correctly finds the position where the pattern matches, but it runs slower: 1.23 s vs. 0.54 s for the brute-force version on my computer.
Note 2: although one can come up with the worst-case such that Rabin-Karp could be as slow as brute force, the test case given is not designed on purpose to make it O(m(n-m+1)).
Disclaimer : Although this problem is an assignment problem in Algorithms, 4th Edition by Sedgewick and Wayne, it is not my homework. I am learning this algorithm.
Here is the code:
'''
Searches for a 2D pattern in a 2D text. Assumes that both the pattern and the
text are rectangles of characters.
O(Mr * Nr * Nc), where Mr is the pattern row length, Nr is the text row length
and Nc is the text column length
'''
MOD = 10**9+7


class RabinKarp2DV3(object):
    """Rabin-Karp search for a rectangular pattern inside a rectangular text.

    Each column of the current ``height``-row band of the text is hashed to a
    single number (``hash2D`` / ``rehash2D``).  The resulting row of column
    hashes is scanned with a 1D rolling hash (``SingleHash`` /
    ``SingleReHash``).  Every hash hit is confirmed character by character
    with ``check``, so a hash collision never yields a false match.
    """

    def __init__(self, rad, pattern):
        """Precompute the radix powers and the hash of ``pattern``.

        :param rad: radix of the alphabet (the caller in this file uses 256)
        :param pattern: non-empty sequence of equal-length, non-empty strings
        :raises IndexError: if ``pattern`` or its first row is empty
        """
        self.RADIX = rad
        self.pattern = pattern
        self.height = len(pattern)
        self.width = len(pattern[0])
        # factors_*[i] == RADIX**i % MOD; only the last entry of each list is
        # used, to remove the leading term when a hash is rolled.
        self.factors_col = self._radix_powers(self.height)
        self.factors_row = self._radix_powers(self.width)
        hash1d_p = [0] * self.width
        self.hash2D(self.pattern, hash1d_p, self.width)
        self.patternHash = self.SingleHash(hash1d_p)

    def _radix_powers(self, count):
        """Return ``[RADIX**0, ..., RADIX**(count-1)]`` reduced modulo MOD."""
        powers = [0] * count
        powers[0] = 1
        for i in range(1, count):
            powers[i] = (self.RADIX * powers[i - 1]) % MOD
        return powers

    def hash2D(self, data, hash1d, hei):
        """Fill ``hash1d[0:hei]`` with the hashes of the first ``height`` rows.

        ``hash1d[i]`` becomes the polynomial hash of column ``i`` taken over
        rows ``0 .. height-1`` of ``data``.  Despite its name, ``hei`` is the
        number of columns to hash.
        """
        radix = self.RADIX
        rows = [data[j] for j in range(self.height)]
        for i in range(hei):
            column_hash = 0
            for row in rows:
                column_hash = (radix * column_hash + ord(row[i])) % MOD
            hash1d[i] = column_hash

    def rehash2D(self, data, hash1d, hei, j):
        """Roll every column hash one row down, in place.

        Row ``j`` leaves the band and row ``j + height`` enters it.
        """
        radix = self.RADIX
        lead = self.factors_col[self.height - 1]
        outgoing = data[j]
        incoming = data[j + self.height]
        for i in range(hei):
            # Python's % is non-negative for a positive modulus, so the
            # subtraction needs no explicit "+ MOD".
            trimmed = (hash1d[i] - lead * ord(outgoing[i])) % MOD
            hash1d[i] = (radix * trimmed + ord(incoming[i])) % MOD

    def SingleHash(self, hash1d):
        """Return the hash of the first ``width`` column hashes."""
        radix = self.RADIX
        res = 0
        for i in range(self.width):
            res = (radix * res + hash1d[i]) % MOD
        return res

    def SingleReHash(self, hash, hash1d, pos):
        """Roll ``hash`` one column right: drop ``hash1d[pos]``, add ``hash1d[pos+width]``."""
        lead = self.factors_row[self.width - 1]
        trimmed = (hash - lead * hash1d[pos]) % MOD
        return (self.RADIX * trimmed + hash1d[pos + self.width]) % MOD

    def check(self, text, i, j):
        """Return True if the pattern occurs in ``text`` with top-left corner ``(i, j)``."""
        for a in range(self.height):
            text_row = text[i + a]
            pattern_row = self.pattern[a]
            for b in range(self.width):
                if text_row[j + b] != pattern_row[b]:
                    return False
        return True

    def search(self, text):
        """Return ``[row, col]`` of the first match in row-major order, or None.

        :param text: sequence of equal-length strings; empty text or a text
            smaller than the pattern yields None instead of raising
        """
        if not text or not text[0]:
            return None
        n_cols = len(text[0])
        last_col = n_cols - self.width
        if last_col < 0:
            return None
        target = self.patternHash
        hash1d = [0] * n_cols
        for i in range(len(text) - self.height + 1):
            if i == 0:
                self.hash2D(text, hash1d, n_cols)
            else:
                self.rehash2D(text, hash1d, n_cols, i - 1)
            textHash = self.SingleHash(hash1d)
            for j in range(last_col + 1):
                if j:
                    textHash = self.SingleReHash(textHash, hash1d, j - 1)
                if textHash == target and self.check(text, i, j):
                    return [i, j]
        return None
class BruteForce(object):
    """Exhaustive 2D pattern search: try every top-left corner in row-major order."""

    def __init__(self, pattern):
        """Store the pattern (sequence of equal-length rows) and its dimensions."""
        self.pattern = pattern
        self.height = len(pattern)
        self.width = len(pattern[0])

    def check(self, text, i, j):
        """Return True if the pattern matches ``text`` with top-left corner ``(i, j)``."""
        mismatch_found = any(
            text[i + a][j + b] != self.pattern[a][b]
            for a in range(self.height)
            for b in range(self.width)
        )
        return not mismatch_found

    def search(self, text):
        """Return ``[row, col]`` of the first match, or None when there is none."""
        candidate_rows = range(len(text) - self.height + 1)
        for top in candidate_rows:
            candidate_cols = range(len(text[0]) - self.width + 1)
            for left in candidate_cols:
                if self.check(text, top, left):
                    return [top, left]
        return None
if __name__ == "__main__":
    # Benchmark: build a random 1000x2000 text, cut a 20x40 pattern out of it
    # at (357, 478), then time both matchers on the same input.
    import random
    import string
    import time

    chars = string.ascii_uppercase
    im, jm = 1000, 2000
    # str.join builds each row in one pass instead of repeated "s += c".
    text = ["".join(random.choice(chars) for _ in range(jm)) for _ in range(im)]
    pattern = [text[357 + i][478:478 + 40] for i in range(20)]

    # perf_counter is the monotonic high-resolution clock meant for timing.
    start_time = time.perf_counter()
    matcher = RabinKarp2DV3(256, pattern)
    print(matcher.search(text))
    print("--- %s seconds ---" % (time.perf_counter() - start_time))

    start_time = time.perf_counter()
    matcher = BruteForce(pattern)
    print(matcher.search(text))
    print("--- %s seconds ---" % (time.perf_counter() - start_time))

Related

How to use more than 1 cpu core with this code?

I have this code to convert .VCF files to .GENO files. To test it, I used only the smallest file on my laptop, but to process all of the files I need a bigger machine, and it would be really nice if it didn't take weeks to run. It works perfectly on my laptop (4th-gen i7), but it is really slow on our workstation server (which has 48 cores, yet runs slower). How can I modify the code to use more cores? Thank you in advance.
The code:
import allel
import pandas as pd
import numpy as np
from time import process_time
import numba as nb
#nb.jit(forceobj=True)
def create_chrpos(data, n):
    """Return the ``pos`` values of every row of ``data`` whose ``chr`` equals ``n``.

    :param data: DataFrame with at least the columns ``chr`` and ``pos``
    :param n: chromosome number to select
    :return: 1-D NumPy array of positions in row order (empty if none match).
        The result is always an array, also when only the first row matches.
    """
    # A boolean mask selects all matching rows in one vectorised pass,
    # replacing the per-row np.append that re-allocated the array every time.
    matches = data['chr'] == n
    return data.loc[matches, 'pos'].to_numpy()
#nb.njit
def create_needed_pos(chr_pos, pos):
    """Return the values of ``pos`` that also occur in ``chr_pos``.

    Results are ordered by their position in ``chr_pos``; one entry is
    appended for every matching ``(chr_pos[i], pos[k])`` pair.

    :param chr_pos: indexable sequence of reference positions
    :param pos: indexable sequence of positions read from the VCF
    :return: ``nb.typed.List`` created with element type ``nb.int32``
    """
    needed_pos = nb.typed.List.empty_list(nb.int32)
    for i in range(len(chr_pos)):
        for k in range(len(pos)):
            if chr_pos[i] == pos[k]:
                # Always append: the former "i == k == 1" special case
                # replaced the list and discarded earlier matches.
                needed_pos.append(pos[k])
    return needed_pos
#nb.njit
def create_needed_index(chr_pos, pos):
    """Return the values of ``pos`` that also occur in ``chr_pos``.

    NOTE(review): despite the name, the matching *value* ``pos[k]`` is
    collected, not the index ``k`` - confirm which one is intended.

    :param chr_pos: indexable sequence of reference positions
    :param pos: indexable sequence of positions read from the VCF
    :return: ``nb.typed.List`` created with element type ``nb.int32``
    """
    needed_index = nb.typed.List.empty_list(nb.int32)
    for i in range(len(chr_pos)):
        for k in range(len(pos)):
            if chr_pos[i] == pos[k]:
                # Always append: the former "i == k == 1" special case
                # replaced the list and discarded earlier matches.
                needed_index.append(pos[k])
    return needed_index
#nb.njit
def create_mat(geno):
    """Encode a genotype array as a 2-D ``uint8`` matrix.

    ``geno[i, k]`` is read as a pair ``(g0, g1)``:

    * ``(0, 0)`` -> 2
    * ``(0, 1)`` or ``(1, 0)`` -> 1
    * ``(1, 1)`` -> 0
    * anything else -> 9

    :param geno: array indexable as ``geno[i, k][0]`` and ``geno[i, k][1]``
    :return: ``np.uint8`` array with the shape of the first two axes of ``geno``
    """
    geno = np.asarray(geno)
    first = geno[:, :, 0]
    second = geno[:, :, 1]
    # Start from 9 everywhere, then overwrite the cells where both alleles
    # are 0 or 1.
    geno_mat = np.full(first.shape, 9, dtype=np.uint8)
    called = ((first == 0) | (first == 1)) & ((second == 0) | (second == 1))
    # For 0/1 alleles the code equals 2 minus the sum of the two alleles.
    geno_mat[called] = (2 - first[called] - second[called]).astype(np.uint8)
    return geno_mat
def genotyping(geno, pos, chr_pos):
    """Build the genotype matrix for one chromosome, one row per ``chr_pos`` entry.

    Positions found in ``pos`` take their encoded genotype row from
    ``create_mat``; positions of ``chr_pos`` missing from ``pos`` get a row
    filled with 2.  Rows are sorted by position.

    :param geno: genotype array, passed to ``create_mat``
    :param pos: positions labelling the rows of ``geno``
    :param chr_pos: reference positions for this chromosome
    :return: ``np.int32`` matrix
    """
    needed_pos = create_needed_pos(chr_pos, pos)
    mat = create_mat(geno)
    # A set gives O(1) membership tests instead of scanning the list for
    # every reference position.
    needed_lookup = set(needed_pos)
    list_difference = [item for item in chr_pos if item not in needed_lookup]
    needed_pos_list = list(needed_pos)
    matrix_df = pd.DataFrame(mat, dtype=int, index=pos)
    filtered_geno_dataframe = matrix_df.loc[needed_pos_list, :]
    # Use the real column set of the genotype matrix (previously hard-coded
    # to 2054 columns) so the two frames always line up.
    missing_positions_df = pd.DataFrame(2, index=list_difference, columns=matrix_df.columns)
    finaldataframe = pd.concat([filtered_geno_dataframe, missing_positions_df])
    finaldataframe.sort_index(axis=0, inplace=True)
    final_mat = finaldataframe.to_numpy(dtype=np.int32)
    return final_mat
def write_first_chr(genotype):
    """Create/overwrite ``test_1.geno`` with ``genotype``, then drop the last two bytes.

    Rows are written with ``np.savetxt`` using ``%d`` and no delimiter.
    """
    # Binary mode ('wb' instead of 'w') is what allows the end-relative seek.
    with open('test_1.geno', 'wb') as geno_file:
        np.savetxt(geno_file, genotype, delimiter="", fmt='%d')
        # seek() returns the new absolute offset; truncate the file there.
        geno_file.truncate(geno_file.seek(-2, 2))
def write_remaining_chr(genotype):
    """Append ``genotype`` to ``test_1.geno``, then drop the last two bytes.

    Rows are written with ``np.savetxt`` using ``%d`` and no delimiter.
    """
    # Binary append ('ab' instead of 'a'): text-mode files reject non-zero
    # end-relative seeks, so seek(-2, 2) below needs a binary handle.
    with open('test_1.geno', 'ab') as fout:
        np.savetxt(fout, genotype, delimiter="", fmt='%d')
        fout.seek(-2, 2)
        fout.truncate()
if __name__ == "__main__":
    # Convert every VCF listed in sample_list_test.txt (one path per line;
    # line i is treated as chromosome i + 1) into test_1.geno.
    t1_start = process_time()
    data = pd.read_csv('REICH_1KG.snp', delimiter=r"\s+")
    data.columns = ['ID', "chr", "pyspos", "pos", "Ref", "Alt"]
    # The context manager closes the sample list even if a conversion fails.
    with open("sample_list_test.txt") as samples:
        for i, line in enumerate(samples):
            strip_line = line.strip()
            n = i + 1
            chr_pos = create_chrpos(data, n)
            # Parse each VCF once and request both fields, instead of
            # reading the same file twice.
            callset = allel.read_vcf(strip_line, fields=("calldata/GT", "variants/POS"))
            geno = callset["calldata/GT"]
            pos = callset["variants/POS"]
            genotype = genotyping(geno, pos, chr_pos)
            if i + 1 == 1:
                print("First chromosome done")
                write_first_chr(genotype)
            else:
                write_remaining_chr(genotype)
                print("Done:Chr number:", n)
    print("Finished genotyping")
    t1_stop = process_time()
    print("Ennyi idő kellett teszt1:", t1_stop - t1_start)

How to avoid out of memory python?

I'm new to Python and Ubuntu. My Python process gets killed while running the code below. The input file is around 2.7 GB, and I have 16 GB of RAM and a 1 TB hard disk ... What should I do to avoid this? From what I have found while searching, it seems to be an out-of-memory problem.
I used this command
free -mh
I got
total used free shared buff/cache available
Mem: 15G 2.5G 9.7G 148M 3.3G 12G
Swap: 4.0G 2.0G 2.0G
the code link I tried Link
import numpy as np
import matplotlib.pyplot as plt
class ProcessData(object):
    """Density and distance helpers working on a dict of pairwise distances.

    Distances are stored as ``{(i, j): dist}`` with point ids starting at 1.
    """

    def data_process(self, folder):
        '''
        Read pairwise distances from a whitespace-separated text file whose
        lines hold ``i j distance``. Blank lines are skipped.

        Every pair is stored in both orientations plus a zero diagonal, so
        the dict holds about two entries per input line.

        :folder: data file path
        :rtype: dict pair distance
                MAX id number
        '''
        distance = dict()
        max_pt = 0
        with open(folder, 'r') as data:
            for line in data:
                fields = line.split()
                if not fields:
                    # tolerate empty lines, e.g. a trailing newline
                    continue
                i, j, dis = fields
                i, j, dis = int(i), int(j), float(dis)
                distance[(i, j)] = dis
                distance[(j, i)] = dis
                max_pt = max(i, j, max_pt)
        for num in range(1, max_pt + 1):
            distance[(num, num)] = 0
        return distance, max_pt

    def entropy(self, distance, maxid, factor):
        '''
        :distance: dict with pair: dist
        :factor: impact factor
        :maxid: max elem number
        :rtype: entropy H in data field
        '''
        potential = dict()
        for i in range(1, maxid + 1):
            tmp = 0
            for j in range(1, maxid + 1):
                tmp += np.exp(-pow(distance[(i, j)] / factor, 2))
            potential[i] = tmp
        z = sum(potential.values())
        H = 0
        for i in range(1, maxid + 1):
            x = potential[i] / z
            H += x * np.log(x)
        return -H

    def threshold(self, dist, max_id):
        '''
        Scan 50 factors in [0.330, 0.430] and return ``3 * factor / sqrt(2)``
        for the factor with the smallest non-zero entropy.

        :rtype: factor value makes H smallest
        :raises ValueError: if no factor yields a usable entropy value
        '''
        # Start from +inf so the first usable entropy is always accepted.
        entro = float('inf')
        thresh = None
        # given data:
        # 0.02139999999999999 7.203581306901208
        # 0.02149999999999999 7.203577254067677
        # 0.02159999999999999 7.203577734107922
        # generate data:
        # 0.367020, 6.943842
        # 0.368959, 6.943840
        # 0.370898, 6.943841
        scape = np.linspace(0.330, 0.430, 50)
        # For general data use the following line instead
        # scape = np.linspace(0.001, 1.001, 100)
        for factor in scape:
            value = self.entropy(dist, max_id, factor)
            print('factor: {0:.6f}, entropy: {1:.8f}'.format(factor, value))
            # plt.scatter(factor, value, c='r', s=1)
            if value and value < entro:
                entro, thresh = value, factor
        if thresh is None:
            # Previously this case failed with UnboundLocalError on the
            # line below.
            raise ValueError('no factor produced a usable entropy value')
        thresh = 3 * thresh / pow(2, 0.5)
        # Disabled plotting of the entropy curve:
        # plt.xlabel(r'$\sigma$')
        # plt.ylabel(r'H')
        # plt.savefig('./images/Entropy test.png')
        # plt.close()
        print('current: ', entro, thresh)
        # given data: 7.203577254067677 0.04560838738653229
        # generate data: 6.943840312796875 0.7828967189629044
        return thresh

    def CutOff(self, distance, max_id, threshold):
        '''
        For every point count the points (itself included) closer than
        ``threshold``.

        :rtype: list with Cut-off kernel values by desc
        '''
        cut_off = dict()
        for i in range(1, max_id + 1):
            tmp = 0
            for j in range(1, max_id + 1):
                gap = distance[(i, j)] - threshold
                tmp += 0 if gap >= 0 else 1
            cut_off[i] = tmp
        sorted_cutoff = sorted(cut_off.items(), key=lambda k: k[1], reverse=True)
        return sorted_cutoff

    def Guasse(self, distance, max_id, threshold):
        '''
        For every point sum ``exp(-(d / threshold)**2)`` over all points.

        :rtype: list with Gaussian kernel values by desc
        '''
        guasse = dict()
        for i in range(1, max_id + 1):
            tmp = 0
            for j in range(1, max_id + 1):
                tmp += np.exp(-pow((distance[(i, j)] / threshold), 2))
            guasse[i] = tmp
        sorted_guasse = sorted(guasse.items(), key=lambda k: k[1], reverse=True)
        return sorted_guasse

    def min_distance(self, distance, srt_dens, maxid):
        '''
        For every point find the nearest point that comes earlier in
        ``srt_dens``. The first point gets its largest distance to any
        point and neighbour id 0.

        :srt_dens: desc sorted list with density values (point, density)
        :rtype: min distance dict
                min number dict
        '''
        min_distance = dict()
        min_number = dict()
        h_dens = srt_dens[0][0]
        min_number[h_dens] = 0
        max_dist = -1
        for i in range(1, maxid + 1):
            max_dist = max(distance[(h_dens, i)], max_dist)
        min_distance[h_dens] = max_dist
        for j in range(1, len(srt_dens)):
            # NOTE(review): the search starts at 1, so a point whose earlier
            # neighbours are all >= 1 away keeps (1, 0) - presumably
            # distances are normalised below 1; confirm.
            min_dist, min_num = 1, 0
            current_num = srt_dens[j][0]
            for k in srt_dens[0:j]:
                current_dist = distance[(current_num, k[0])]
                if current_dist < min_dist:
                    min_dist, min_num = current_dist, k[0]
            min_distance[srt_dens[j][0]] = min_dist
            min_number[current_num] = min_num
        return min_distance, min_number

    def make_pair(self, srt_dens, min_dist, maxid):
        '''
        :rtype: pair dict with {point: [density, min dist]}
                refer factor dict with {point: density * dist}
                (None, after printing a message, when ``srt_dens`` does not
                hold exactly ``maxid`` distinct points)
        '''
        pair_dict = dict()
        dens_dict = dict()
        refer_dict = dict()
        # convert list to dict
        for elem in srt_dens:
            dens_dict[elem[0]] = elem[1]
        if len(dens_dict) != maxid:
            # Report the number of missing points; the old code subtracted
            # the dict itself and never formatted the message.
            print('missing %d value' % (maxid - len(dens_dict)))
            return None
        for key in dens_dict.keys():
            pair_dict[key] = [dens_dict[key], min_dist[key]]
            refer_dict[key] = dens_dict[key] * min_dist[key]
        return pair_dict, refer_dict

    def show_pair_info(self, pair, threshold):
        '''
        Scatter-plot min distance against density and save the figure to
        ``./images/Decision Graph Cutoff test.png``.
        '''
        show_dict = dict()
        for p in pair.values():
            show_dict[p[0]] = p[1]
        tmp = sorted(show_dict.items())
        dens, mdis = zip(*tmp)
        plt.scatter(dens, mdis)
        plt.xlabel(r'$\rho$')
        plt.ylabel(r'$\delta$')
        plt.title(r'$d_c=$' + str(threshold))
        plt.savefig('./images/Decision Graph Cutoff test.png')
        plt.close()
I tried to track it down using fil-profile, which reported a problem at line 11, pointing to this data_process method.
An issue could be f.readlines() as it creates a complete list.
So if COOR_DATA is very large then you should only create memory for one line at a time, so try changing:
with open(COOR_DATA, 'r', encoding='utf-8') as f:
lines = f.readlines()
coords = dict()
for line in lines:
To:
with open(COOR_DATA, 'r', encoding='utf-8') as f:
coords = dict()
for line in f:
See https://docs.python.org/3/tutorial/inputoutput.html#methods-of-file-objects

Trust-Region Dogleg Method for Nonlinear Equations

Hi, I am trying to write a trust-region algorithm using the dogleg method in Python for a class I am taking. I have a Newton's Method algorithm and a Broyden's Method algorithm that agree with each other, but I can't seem to get this dogleg method to work.
Here is the function I am trying to find the solution to:
def test_function(x):
x1 = float(x[0])
x2 = float(x[1])
r = np.array([[x2**2 - 1],
[np.sin(x1) - x2]])
return r
and here is the jacobian I wrote
def Test_Jacobian(x, size):
    """Approximate the Jacobian of ``test_function`` at ``x`` by forward differences.

    ``J[i][j] = (r_i(x + epsilon * e_j) - r_i(x)) / epsilon``

    :param x: the point (not the residual) at which to differentiate
    :param size: number of variables; ``test_function`` is assumed to return
        ``size`` residuals
    :return: ``(size, size)`` array
    """
    epsilon = 10e-8  # forward-difference step, i.e. 1e-7
    base_point = np.asarray(x, dtype=float).reshape(size, 1)
    base_value = np.asarray(test_function(base_point), dtype=float).reshape(size)
    J = np.zeros([size, size])
    for j in range(size):
        # Perturb only coordinate j of the full point. The old code
        # evaluated the function at x[i]*e[j], which is not x + eps*e_j.
        shifted = base_point.copy()
        shifted[j, 0] += epsilon
        shifted_value = np.asarray(test_function(shifted), dtype=float).reshape(size)
        J[:, j] = (shifted_value - base_value) / epsilon
    return J
and here is my Trust-Region algorithm:
def Trust_Region(x, max_iter=1000):
    """Solve ``test_function(x) = 0`` with a trust-region dogleg iteration.

    :param x: starting point (column vector)
    :param max_iter: safety cap on the number of iterations
    :return: ``(x, iteration_table, function_table)`` - the final point, the
        iteration numbers and the residual norm recorded at each of them
    """
    trust_radius = 1
    max_trust = 300
    eta = rand.uniform(0, .25)
    r = test_function(x)  # change to correspond with the function you want
    # The Jacobian must be evaluated at the point x, not at the residual r.
    J = Test_Jacobian(x, r.size)  # change to correspond with function
    i = 0
    iteration_table = [i]
    function_table = [vector_norm(r, r.size)]
    while vector_norm(r, r.size) > 10e-10 and i < max_iter:
        print(x, 'at iteration', i, "norm of r is", vector_norm(r, r.size))
        p = dogleg(x, r, J, trust_radius)
        rho = ratio(x, J, p)
        step_norm = vector_norm(p, p.size)
        if rho < 0.25:
            print('first')
            trust_radius = 0.25 * step_norm
        # Compare with a tolerance: the step norm is a rounded float and is
        # almost never bit-identical to the radius.
        elif rho > 0.75 and np.isclose(step_norm, trust_radius):
            print('second')
            trust_radius = min(2 * trust_radius, max_trust)
        else:
            print('third')
        if rho > eta:
            print('x changed')
            x = x + p
            # r and J only change when the step is accepted.
            r = test_function(x)  # change to correspond with the function you want
            J = Test_Jacobian(x, r.size)  # change to correspond with function
        else:
            print('x did not change')
        i = i + 1
        print(rho)
        iteration_table.append(i)
        function_table.append(vector_norm(r, r.size))
    print('The solution to the non-linear equation is: ', x)
    print('This solution was obtained in ', i, 'iteratations')
    plt.figure(figsize=(10, 10))
    plt.plot(iteration_table, np.log10(function_table))
    plt.xlabel('iteration number')
    plt.ylabel('function value')
    plt.title('Semi-Log Plot for Convergence')
    return x, iteration_table, function_table
def dogleg(x, r, J, trust_radius):
    """Return the dogleg step for the model ``0.5 * ||r + J p||**2``.

    Three cases:

    * the full Gauss-Newton step lies inside the trust region: take it;
    * the Cauchy (steepest-descent) point lies on the boundary: take it;
    * otherwise move from the Cauchy point towards the Gauss-Newton step
      until the boundary is reached (``tau_finder``).

    :param x: current point (only forwarded to ``tau_finder``)
    :param r: residual column vector at ``x``
    :param J: Jacobian at ``x``
    :param trust_radius: current trust-region radius
    :return: step ``p_k`` as a column vector
    """
    g = J.transpose().dot(r)  # gradient of the model at p = 0
    g_norm = vector_norm(g, r.size)
    if g_norm == 0:
        # Stationary point of the model: no descent direction exists.
        return np.zeros_like(g, dtype=float)
    # Least-squares solve instead of inverting J^T J: same step when J has
    # full rank, and it does not fail when J is singular.
    p_j = np.linalg.lstsq(J, -r, rcond=None)[0]
    if vector_norm(p_j, p_j.size) <= trust_radius:
        print('using p_j')
        return p_j
    # g^T (J^T J) g == ||J g||**2
    Jg = J.dot(g)
    curvature = float(np.sum(Jg * Jg))
    if curvature <= 0:
        tau_k = 1
    else:
        tau_k = min(1, g_norm**3 / (trust_radius * curvature))
    p_c = -tau_k * (trust_radius / g_norm) * g
    # tau_k == 1 means the Cauchy point sits on the boundary; testing tau_k
    # avoids comparing two floating-point norms for equality.
    if tau_k >= 1:
        print('using p_c')
        return p_c
    print('using p_j')
    tau = tau_finder(x, p_c, p_j, trust_radius, r.size)
    p_k = p_c + tau * (p_j - p_c)
    return p_k
def ratio(x, J, p):
    """Return actual reduction divided by predicted reduction for step ``p``.

    ``rho = (||r(x)||^2 - ||r(x+p)||^2) / (||r(x)||^2 - ||r(x) + J p||^2)``

    :return: the ratio, or 0.0 when the model predicts no reduction at all
    """
    r = test_function(x)
    r_p = test_function(x + p)
    current = vector_norm(r, r.size)**2
    trial = vector_norm(r_p, r_p.size)**2
    model = vector_norm(r + J.dot(p), r.size)**2
    print(current)
    print(trial)
    print(model)
    predicted = current - model
    if predicted == 0:
        # Happens for a zero step or at convergence; dividing would raise
        # ZeroDivisionError. 0.0 makes the caller reject the step.
        return 0.0
    rho_k = (current - trial) / predicted
    return rho_k
def tau_finder(x, p_c, p_j, trust_radius, size):
    """Return ``tau`` in [0, 1] with ``||p_c + tau * (p_j - p_c)|| == trust_radius``.

    Solves ``a*tau**2 + b*tau + c = 0`` with ``d = p_j - p_c``,
    ``a = d.d``, ``b = 2 * d.p_c`` and ``c = p_c.p_c - trust_radius**2``.

    :param x: unused; kept for interface compatibility (steps are relative
        to the current point, so it does not enter the equation)
    :param p_c: Cauchy step
    :param p_j: Gauss-Newton step
    :param trust_radius: trust-region radius
    :param size: number of components to use
    :return: ``tau`` as a float
    :raises ValueError: if the segment does not cross the boundary
    """
    start = np.asarray(p_c, dtype=float).ravel()[:size]
    direction = np.asarray(p_j, dtype=float).ravel()[:size] - start
    a = float(direction.dot(direction))
    b = 2.0 * float(direction.dot(start))
    c = float(start.dot(start)) - trust_radius**2
    if a == 0:
        raise ValueError('p_c and p_j coincide; no dogleg segment to search')
    discriminant = b**2 - 4*a*c
    if discriminant < 0:
        raise ValueError('dogleg segment does not reach the trust-region boundary')
    root = np.sqrt(discriminant)
    tau_p = (-b + root) / (2*a)
    tau_m = (-b - root) / (2*a)
    if tau_p <= 1 and tau_p >= 0:
        return float(tau_p)
    elif tau_m <= 1 and tau_m >= 0:
        return float(tau_m)
    raise ValueError('no intersection with the trust-region boundary for tau in [0, 1]')
def model_function(p):
    """Return the quadratic model ``0.5 * ||r(x) + J(x) p||**2`` for step ``p``.

    NOTE(review): ``x`` is not a parameter; it is read from the enclosing
    (module) scope - confirm it holds the current iterate when called.
    """
    r = test_function(x)
    # The Jacobian must be evaluated at the point x, not at the residual r.
    J = Test_Jacobian(x, r.size)
    return 0.5*vector_norm(r + J.dot(p), r.size)**2
The answer should be about [[1.57076525], [1. ]]
but here is the output after about 28-30 iterations:
ZeroDivisionError Traceback (most recent call last)
<ipython-input-359-a414711a1671> in <module>
1 x = create_point(2,1)
----> 2 Trust_Region(x)
<ipython-input-358-7cb77bd44d7b> in Trust_Region(x)
11 print(x, 'at iteration', i, "norm of r is", vector_norm(r, r.size))
12 p = dogleg(x, r, J, trust_radius)
---> 13 rho = ratio(x, J, p)
14
15 if rho < 0.25:
<ipython-input-358-7cb77bd44d7b> in ratio(x, J, p)
71 print (vector_norm(r_p, r_p.size)**2)
72 print (vector_norm(r + J.dot(p), r.size)**2)
---> 73 rho_k =(vector_norm(r, r.size)**2 - vector_norm(r_p, r_p.size)**2)/(vector_norm(r, r.size)**2 - vector_norm(r + J.dot(p), r.size)**2)
74 return rho_k
75
ZeroDivisionError: float division by zero

How to give each Category a color?

We have code that draws circles at each location on the map, labelled with the name of each category. At the moment the circles and the text are all one color. How do we give them different colors by category? Example: Category Garden: Blue, Category Stone: Grey.
So far the code:
# Canvas and drawing-state set-up. These are drawing-environment builtins
# (presumably NodeBox/PlotDevice - TODO confirm); none is defined in this file.
size(1500,800)
background(1)
# nofill() is followed by fill() two lines below, so the later call is
# presumably the one that takes effect for the circles - verify.
nofill()
stroke('#f91')
pen(.2)
# Same colour as the stroke, with a second argument of 0.05 (presumably alpha).
fill('#f91', 0.05)
rotate(90)
font("Avenir", "bold", 10)
align('left')
def mapValue(value, fromMin, fromMax, toMin, toMax):
    """Linearly re-map ``value`` from [fromMin, fromMax] onto [toMin, toMax]."""
    # Widths of the source and target ranges
    source_width = fromMax - fromMin
    target_width = toMax - toMin
    # Position of value inside the source range as a 0-1 fraction (float)
    fraction = float(value - fromMin) / float(source_width)
    # Same fraction of the way through the target range
    return toMin + (fraction * target_width)
def xOfDot(lon):
    """Map ``lon`` from the range [-100, 100] onto [0, WIDTH]."""
    return mapValue(lon, fromMin=-100, fromMax=100, toMin=0, toMax=WIDTH)


def yOfDot(lat):
    """Map ``lat`` from the range [-90, 90] onto [HEIGHT, 0] (axis flipped)."""
    return mapValue(lat, fromMin=-90, fromMax=90, toMin=HEIGHT, toMax=0)
# Load the alerts, accumulate the number of stolen items per category and
# draw one circle plus a label for every alert that has both fields.
# NOTE(review): `json` is not imported in the visible part of the file -
# confirm it is imported above.
with open('theft-alerts.json', 'r') as inputFile:
    data = json.load(inputFile)
print(len(data))
artworksPerCity = {}
for stolenArt in data:
    # `in` replaces dict.has_key(), which only exists in Python 2.
    if 'Category' in stolenArt:
        city = stolenArt['Category']
        if 'nItemsStolen' in stolenArt:
            numbersStolen = int(stolenArt['nItemsStolen'])
            if city in artworksPerCity:
                # Adjust the value stored for this city
                artworksPerCity[city] = artworksPerCity[city] + numbersStolen
            else:
                # Create new key with new value
                artworksPerCity[city] = numbersStolen
            # Draw circle on the map
            # NOTE(review): nesting reconstructed - drawing needs
            # artworksPerCity[city], so it is placed inside this branch.
            radius = artworksPerCity[city] /2
            x = xOfDot(stolenArt['Lon'])
            y = yOfDot(stolenArt['Lat'])
            arc(x, y, radius)
            text(city, x, y)
print(artworksPerCity)
Here is a sketch of what I intend to include in my pure python data utility.
def hexidecimalDiget(n, deHex=False):
    """Convert between one hexadecimal digit and its integer value.

    * int ``0..15`` -> the digit character ``'0'..'f'`` (``deHex`` is
      ignored for integers)
    * one-character hex string -> its integer value when ``deHex`` is
      true, otherwise the lower-cased character itself

    :return: the converted digit, or None (after printing a message) for
        unsupported input
    """
    hex_chars = "0123456789abcdef"
    if isinstance(n, str):
        digit = n.lower()
        if len(digit) == 1 and digit in hex_chars:
            if deHex:
                # The position in hex_chars is the numeric value of the digit.
                return hex_chars.index(digit)
            return digit
    elif isinstance(n, int):
        if n < 0:
            print("negitive values not supported by call to hexidecimalDiget("+str(n)+")")
            return None
        if n < 16:
            return hex_chars[n]
    print("call to hexidecimalDiget("+str(n)+") not supported!")
    return None
def colorFormHexArray(arr):
    """Build a ``#...`` colour string from 3 or 6 hex digits or digit groups.

    Each element of ``arr`` is either a single digit or a list of digits;
    every digit is converted with ``hexidecimalDiget``.

    :return: the colour string, or None (after printing a message) when the
        length is not 3 or 6 or a digit cannot be converted
    """
    if len(arr)!=3 and len(arr)!=6:
        print("invalid length for color on call to colorFormHexArray("+str(arr)+")")
        return None
    elif None in arr:
        print("cannot make color from None arguments in "+str(arr))
        return None
    ret = "#"
    for k in arr:
        group = k if isinstance(k, list) else [k]
        for k2 in group:
            # Convert the inner element k2 (the old code converted the whole
            # list k once per inner element).
            digit = hexidecimalDiget(k2)
            if digit is None:
                print("cannot make color from None arguments in "+str(arr))
                return None
            ret += digit
    return ret
def arrayFromColor(c):
    """Split a ``#rgb`` or ``#rrggbb`` colour into per-channel digit lists.

    :param c: colour string; ``#`` characters are ignored
    :return: ``[[r], [g], [b]]`` for 3 digits, ``[[r1, r2], [g1, g2],
        [b1, b2]]`` for 6 digits (integer values 0-15), ``[]`` otherwise
    :raises ValueError: if a character is not a hexadecimal digit
    """
    c = c.replace("#", "")
    if len(c) == 3:
        return [[int(ch, 16)] for ch in c]
    if len(c) == 6:
        # Step through the string two characters at a time; each channel
        # gets its own two consecutive digits.
        return [[int(c[i], 16), int(c[i + 1], 16)] for i in range(0, 6, 2)]
    return []
def intFromHexPair(hp):
    """Return the integer denoted by a most-significant-first digit sequence.

    :param hp: sequence of hex digits, each an int (0-15) or a hex character
    :return: the combined value, e.g. ``[15, 15]`` -> 255
    """
    ret = 0
    for k in hp:
        # Use each digit in turn (the old code always read hp[0]).
        digit = int(k, 16) if isinstance(k, str) else int(k)
        ret = ret * 16 + digit
    return ret
def hexPairFromInt(I, minDigits=1, maxDigits=256):
    """Return the hex digits of ``I`` as a list of characters, most significant first.

    ``minDigits`` and ``maxDigits`` bound the highest exponent ``k`` of 16,
    so the result has ``k + 1`` digits; the default ``minDigits=1`` gives at
    least two digits (one pair).

    :return: list of digit characters, or None (after printing a message)
        for negative input
    """
    if I < 0:
        print("negitive numbers not supported by hexPairFromInt")
        return None
    I = int(I)
    k = 0
    while 16**(k + 1) <= I:
        k += 1
    if k < minDigits:
        k = minDigits
    if k > maxDigits:
        print("maxDigitsExceeded")
    ret = []
    while k >= 0:
        # Digit at 16**k: shift it down with // and isolate it with % 16.
        ret.append("0123456789abcdef"[(I // 16**k) % 16])
        k -= 1
    return ret
def specColor(start, end, bottom, top):
    """Return a function mapping a value in [bottom, top] to a colour.

    The returned function interpolates every channel linearly from the
    ``start`` colour (at ``bottom``) to the ``end`` colour (at ``top``).

    :param start: colour string for ``bottom`` (``#rgb`` or ``#rrggbb``)
    :param end: colour string for ``top``
    :return: function ``ret(v)`` giving a ``#rrggbb`` string, or
        ``'#aa0000'`` (after printing a message) when ``v`` is out of range
    """
    def channel_values(color):
        # A one-digit channel (#rgb shorthand) is doubled so that 'f'
        # means 0xff, matching the two-digit form.
        return [intFromHexPair(k * 2 if len(k) == 1 else k) for k in arrayFromColor(color)]

    starts = channel_values(start)
    ends = channel_values(end)

    def ret(v):
        # Range-check against the numeric bounds (the old code compared v
        # with the colour arrays and used an undefined name).
        if v < bottom or v > top:
            print("value out of range "+str([bottom, top]))
            return('#aa0000') #eyo <- error red
        # float() avoids integer division; a zero-width range maps to start.
        normalized = float(v - bottom) / (top - bottom) if top != bottom else 0.0
        return colorFormHexArray([
            hexPairFromInt(int(s + (e - s) * normalized), minDigits=1, maxDigits=256)
            for s, e in zip(starts, ends)
        ])
    return ret
This seems excessive and hasn't been tested at all yet (it is just a sketch at the moment), but I'll be testing and incorporating this code here tonight: http://krewn.github.io/KPlot/

SHA-256 implementation in Python

I'm looking for a Python implementation of the SHA-256 hash function. I want to use it to get a better understanding of how the SHA-256 function works, and I think Python is the ideal language for this. Pseudo-code has the limitation that I can't run/test it, to see what my modifications of the code do to the output.
PyPy's source contains a pure-python implementation of SHA-256 here. Poking around in that directory, you'll probably also find pure-python implementations of other standard hashes.
# The eight 32-bit words, as hex strings, that sha_256() loads into its
# working variables a..h and adds back into the final hash.
initial_hash_values=[
'6a09e667','bb67ae85','3c6ef372','a54ff53a',
'510e527f','9b05688c','1f83d9ab','5be0cd19'
]
# The 64 per-round constants, as hex strings; sha_256() adds entry i in round i.
sha_256_constants=[
'428a2f98','71374491','b5c0fbcf','e9b5dba5',
'3956c25b','59f111f1','923f82a4','ab1c5ed5',
'd807aa98','12835b01','243185be','550c7dc3',
'72be5d74','80deb1fe','9bdc06a7','c19bf174',
'e49b69c1','efbe4786','0fc19dc6','240ca1cc',
'2de92c6f','4a7484aa','5cb0a9dc','76f988da',
'983e5152','a831c66d','b00327c8','bf597fc7',
'c6e00bf3','d5a79147','06ca6351','14292967',
'27b70a85','2e1b2138','4d2c6dfc','53380d13',
'650a7354','766a0abb','81c2c92e','92722c85',
'a2bfe8a1','a81a664b','c24b8b70','c76c51a3',
'd192e819','d6990624','f40e3585','106aa070',
'19a4c116','1e376c08','2748774c','34b0bcb5',
'391c0cb3','4ed8aa4a','5b9cca4f','682e6ff3',
'748f82ee','78a5636f','84c87814','8cc70208',
'90befffa','a4506ceb','bef9a3f7','c67178f2'
]
def bin_return(dec):
    """Return ``dec`` as a binary string without padding."""
    return '{:b}'.format(dec)


def bin_8bit(dec):
    """Return ``dec`` as a binary string zero-padded to at least 8 digits."""
    return '{:08b}'.format(dec)


def bin_32bit(dec):
    """Return ``dec`` as a binary string zero-padded to at least 32 digits."""
    return '{:032b}'.format(dec)


def bin_64bit(dec):
    """Return ``dec`` as a binary string zero-padded to at least 64 digits."""
    return '{:064b}'.format(dec)


def hex_return(dec):
    """Return ``dec`` as a lowercase hexadecimal string without padding."""
    return '{:x}'.format(dec)


def dec_return_bin(bin_string):
    """Parse a binary string into an int."""
    return int(bin_string, base=2)


def dec_return_hex(hex_string):
    """Parse a hexadecimal string into an int."""
    return int(hex_string, base=16)
def L_P(SET, n):
    """Split ``SET`` into consecutive chunks of exactly ``n`` items.

    A trailing remainder shorter than ``n`` is dropped.

    :param SET: sliceable sequence (string or list)
    :param n: chunk length, must be positive
    :return: list of slices of ``SET``
    :raises ValueError: if ``n`` is not positive (the old loop never ended)
    """
    if n <= 0:
        raise ValueError("chunk size n must be positive")
    return [SET[start:start + n] for start in range(0, len(SET) - n + 1, n)]
def s_l(bit_string):
    """Return the items of ``bit_string`` as a list, one entry per position."""
    return [bit_string[position] for position in range(len(bit_string))]
def l_s(bit_list):
    """Concatenate a list of strings into one string."""
    # str.join is linear; repeated += may copy the string on every step.
    return ''.join(bit_list)
def rotate_right(bit_string, n):
    """Rotate ``bit_string`` right by ``n`` positions (circular).

    ``n`` is taken modulo the length, so ``n == 0`` returns the input
    unchanged and a negative ``n`` rotates left.

    :return: the rotated string (empty input gives an empty string)
    """
    bits = ''.join(bit_string)
    if not bits:
        return bits
    shift = n % len(bits)
    if shift == 0:
        # Nothing to move. The old loop left its result variable unassigned
        # for n == 0 and raised UnboundLocalError.
        return bits
    return bits[-shift:] + bits[:-shift]
def shift_right(bit_string, n):
    """Shift ``bit_string`` right by ``n`` positions, filling with '0' on the left.

    The result has the same length as the input; shifting by the full
    length or more gives all zeros (the old loop raised IndexError then).
    """
    bits = ''.join(bit_string)
    shift = min(max(n, 0), len(bits))
    return '0' * shift + bits[:len(bits) - shift]
def mod_32_addition(input_set):
    """Return the sum of ``input_set`` reduced modulo 2**32."""
    modulus = 4294967296  # 2**32
    total = 0
    for term in input_set:
        total = total + term
    return total % modulus
def xor_2str(bit_string_1, bit_string_2):
    """Return the bitwise XOR of two bit strings.

    Positions where either character is not '0' or '1' contribute nothing
    to the result.
    """
    valid = ('0', '1')
    pieces = []
    for position, left in enumerate(bit_string_1):
        if left not in valid:
            continue
        right = bit_string_2[position]
        if right not in valid:
            continue
        pieces.append('0' if left == right else '1')
    return l_s(pieces)
def and_2str(bit_string_1, bit_string_2):
    """Return the bitwise AND of two bit strings ('1' only where both are '1')."""
    pieces = [
        '1' if bit_string_1[position] == '1' and bit_string_2[position] == '1' else '0'
        for position in range(len(bit_string_1))
    ]
    return l_s(pieces)
def or_2str(bit_string_1, bit_string_2):
    """Return the bitwise OR of two bit strings ('0' only where both are '0')."""
    pieces = [
        '0' if bit_string_1[position] == '0' and bit_string_2[position] == '0' else '1'
        for position in range(len(bit_string_1))
    ]
    return l_s(pieces)
def not_str(bit_string):
    """Return the bitwise NOT of a bit string ('0' -> '1', anything else -> '0')."""
    flipped = ['1' if bit == '0' else '0' for bit in bit_string]
    return l_s(flipped)
'''
SHA-256 Specific Functions:
'''
def Ch(x, y, z):
    """Choose: ``(x AND y) XOR (NOT x AND z)`` on bit strings."""
    picked_from_y = and_2str(x, y)
    picked_from_z = and_2str(not_str(x), z)
    return xor_2str(picked_from_y, picked_from_z)


def Maj(x, y, z):
    """Majority: ``(x AND y) XOR (x AND z) XOR (y AND z)`` on bit strings."""
    xy = and_2str(x, y)
    xz = and_2str(x, z)
    yz = and_2str(y, z)
    return xor_2str(xor_2str(xy, xz), yz)


def e_0(x):
    """Return ``rotr(x, 2) XOR rotr(x, 13) XOR rotr(x, 22)``."""
    first_two = xor_2str(rotate_right(x, 2), rotate_right(x, 13))
    return xor_2str(first_two, rotate_right(x, 22))


def e_1(x):
    """Return ``rotr(x, 6) XOR rotr(x, 11) XOR rotr(x, 25)``."""
    first_two = xor_2str(rotate_right(x, 6), rotate_right(x, 11))
    return xor_2str(first_two, rotate_right(x, 25))


def s_0(x):
    """Return ``rotr(x, 7) XOR rotr(x, 18) XOR shr(x, 3)``."""
    first_two = xor_2str(rotate_right(x, 7), rotate_right(x, 18))
    return xor_2str(first_two, shift_right(x, 3))


def s_1(x):
    """Return ``rotr(x, 17) XOR rotr(x, 19) XOR shr(x, 10)``."""
    first_two = xor_2str(rotate_right(x, 17), rotate_right(x, 19))
    return xor_2str(first_two, shift_right(x, 10))
def message_pad(bit_list):
    """Pad a message bit string to a multiple of 512 bits.

    Appends a single '1', then the fewest '0' bits that bring the length to
    448 modulo 512, then the original bit length as a 64-bit binary number.
    """
    with_marker = bit_list + '1'
    # Smallest k >= 0 with (len(with_marker) + k) % 512 == 448.
    zero_count = (448 - len(with_marker)) % 512
    length_field = format(len(bit_list), '064b')
    return with_marker + '0' * zero_count + length_field
def message_bit_return(string_input):
    """Return the message as a bit string, 8 bits per character.

    :param string_input: text whose characters all have code points <= 255
    :raises ValueError: if a character does not fit into one byte; it would
        otherwise produce more than 8 bits and misalign the whole message
    """
    pieces = []
    for character in string_input:
        code = ord(character)
        if code > 0xFF:
            raise ValueError(
                "character %r does not fit into 8 bits; encode the text to bytes first"
                % character
            )
        pieces.append(format(code, '08b'))
    return ''.join(pieces)
def message_pre_pro(input_string):
    """Convert the text to bits and apply the message padding."""
    message_bits = message_bit_return(input_string)
    padded_bits = message_pad(message_bits)
    return padded_bits


def message_parsing(input_string):
    """Return the padded message split into 32-bit words (bit strings)."""
    padded_bits = message_pre_pro(input_string)
    return L_P(padded_bits, 32)
def message_schedule(index, w_t):
    """Return message-schedule word ``index`` as a 32-bit string.

    ``W[t] = s_1(W[t-2]) + W[t-7] + s_0(W[t-15]) + W[t-16]`` modulo 2**32.
    """
    terms = [
        int(s_1(w_t[index - 2]), 2),
        int(w_t[index - 7], 2),
        int(s_0(w_t[index - 15]), 2),
        int(w_t[index - 16], 2),
    ]
    return bin_32bit(mod_32_addition(terms))
'''
This example of SHA_256 works for an input string <56 characters.
'''
def sha_256(input_string):
    """Hash a short string and return the digest as eight hex words.

    Only a single 512-bit block is processed, so the input must be shorter
    than 56 characters.

    :param input_string: message text
    :return: tuple of eight 8-digit lowercase hex strings; joined in order
        they form the 64-digit digest
    """
    assert len(input_string) < 56, "This example of SHA_256 works for an input string <56 characters."
    w_t = message_parsing(input_string)
    # Working variables a..h start as the initial hash values, held as
    # 32-bit binary strings.
    a, b, c, d, e, f, g, h = [bin_32bit(dec_return_hex(value)) for value in initial_hash_values]
    for i in range(0, 64):
        if i > 15:
            # Words 16..63 are derived from earlier words on demand.
            w_t.append(message_schedule(i, w_t))
        t_1 = mod_32_addition([int(h, 2), int(e_1(e), 2), int(Ch(e, f, g), 2), int(sha_256_constants[i], 16), int(w_t[i], 2)])
        t_2 = mod_32_addition([int(e_0(a), 2), int(Maj(a, b, c), 2)])
        h = g
        g = f
        f = e
        e = bin_32bit(mod_32_addition([int(d, 2), t_1]))
        d = c
        c = b
        b = a
        a = bin_32bit(mod_32_addition([t_1, t_2]))
    working = (a, b, c, d, e, f, g, h)
    # Zero-pad every word to 8 hex digits: without padding a word with
    # leading zero nibbles comes out short and the joined digest is wrong.
    final_hash = tuple(
        format(mod_32_addition([dec_return_hex(initial), int(word, 2)]), '08x')
        for initial, word in zip(initial_hash_values, working)
    )
    return(final_hash)
Some time ago I was also studying SHA-256 and created pure-python class that implements this hash. If I remember correctly, mostly I've taken algorithm from Wikipedia SHA-256 Pseudocode and partially from some open-source projects.
The algorithm doesn't import any modules (not even standard ones). Of course, it is much slower than hashlib's implementation and is only meant for studying.
If you just run the script it executes 1000 tests comparing hashlib's and my variants. Only testing function imports some modules, algorithm's class itself doesn't need any modules. Interface is same as in hashlib's sha256 class. See test() function for examples of usage.
Try it online!
class Sha256:
    """Pure-Python SHA-256 with a hashlib-like interface.

    Usage: ``Sha256(data).hexdigest()`` or several ``update()`` calls
    followed by ``digest()`` / ``hexdigest()``. Once a digest has been
    requested the object is finalised and rejects further updates.
    """

    # Round constants, one per compression round.
    ks = [
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    ]

    # Initial hash state.
    hs = [
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    ]

    # Mask that keeps results within 32 bits.
    M32 = 0xFFFFFFFF

    def __init__(self, m = None):
        """Create a hash object, optionally feeding it the bytes ``m``."""
        self.mlen = 0              # total number of message bytes fed so far
        self.buf = b''             # bytes not yet forming a full 64-byte block
        self.k = self.ks[:]
        self.h = self.hs[:]
        self.fin = False           # True once the digest has been computed
        self._digest = None        # cached digest bytes
        if m is not None:
            self.update(m)

    @staticmethod
    def pad(mlen):
        """Return the padding for a message of ``mlen`` bytes.

        A 0x80 byte, zero bytes up to 56 modulo 64, then the message length
        in bits as an 8-byte big-endian integer.
        """
        mdi = mlen & 0x3F
        length = (mlen << 3).to_bytes(8, 'big')
        padlen = 55 - mdi if mdi < 56 else 119 - mdi
        return b'\x80' + b'\x00' * padlen + length

    @staticmethod
    def ror(x, y):
        """Rotate the 32-bit value ``x`` right by ``y`` bits."""
        return ((x >> y) | (x << (32 - y))) & Sha256.M32

    @staticmethod
    def maj(x, y, z):
        """Bitwise majority of ``x``, ``y`` and ``z``."""
        return (x & y) ^ (x & z) ^ (y & z)

    @staticmethod
    def ch(x, y, z):
        """Bitwise choice: bits of ``y`` where ``x`` is 1, bits of ``z`` elsewhere."""
        return (x & y) ^ ((~x) & z)

    def compress(self, c):
        """Mix one 64-byte block ``c`` into the hash state ``self.h``."""
        chunk = c  # keep the block under its own name; ``c`` is reused below
        w = [0] * 64
        w[0 : 16] = [int.from_bytes(chunk[i : i + 4], 'big') for i in range(0, len(chunk), 4)]
        for i in range(16, 64):
            s0 = self.ror(w[i - 15], 7) ^ self.ror(w[i - 15], 18) ^ (w[i - 15] >> 3)
            s1 = self.ror(w[i - 2], 17) ^ self.ror(w[i - 2], 19) ^ (w[i - 2] >> 10)
            w[i] = (w[i - 16] + s0 + w[i - 7] + s1) & self.M32
        a, b, c, d, e, f, g, h = self.h
        for i in range(64):
            s0 = self.ror(a, 2) ^ self.ror(a, 13) ^ self.ror(a, 22)
            t2 = s0 + self.maj(a, b, c)
            s1 = self.ror(e, 6) ^ self.ror(e, 11) ^ self.ror(e, 25)
            t1 = h + s1 + self.ch(e, f, g) + self.k[i] + w[i]
            h = g
            g = f
            f = e
            e = (d + t1) & self.M32
            d = c
            c = b
            b = a
            a = (t1 + t2) & self.M32
        for i, (x, y) in enumerate(zip(self.h, [a, b, c, d, e, f, g, h])):
            self.h[i] = (x + y) & self.M32

    def update(self, m):
        """Feed the bytes ``m`` into the hash; ``None`` or empty input is ignored."""
        if m is None or len(m) == 0:
            return
        assert not self.fin, 'Hash already finalized and can not be updated!'
        self.mlen += len(m)
        m = self.buf + m
        for i in range(0, len(m) // 64):
            self.compress(m[64 * i : 64 * (i + 1)])
        # Keep the incomplete tail for the next call.
        self.buf = m[len(m) - (len(m) % 64):]

    def digest(self):
        """Finalise the hash on first use and return the 32-byte digest."""
        if not self.fin:
            self.update(self.pad(self.mlen))
            # Stored as _digest: assigning to self.digest would replace this
            # method and break every later digest()/hexdigest() call.
            self._digest = b''.join(x.to_bytes(4, 'big') for x in self.h[:8])
            self.fin = True
        return self._digest

    def hexdigest(self):
        """Return the digest as a 64-character lowercase hex string."""
        tab = '0123456789abcdef'
        return ''.join(tab[b >> 4] + tab[b & 0xF] for b in self.digest())
def test():
    """Compare ``Sha256`` with ``hashlib.sha256`` on 1000 random inputs."""
    import secrets, hashlib, random

    # One-shot hashing of random messages of 0..256 bytes.
    for itest in range(500):
        data = secrets.token_bytes(random.randrange(257))
        reference = hashlib.sha256(data)
        candidate = Sha256(data)
        a, b = reference.hexdigest(), candidate.hexdigest()
        assert a == b, (a, b)

    # Incremental hashing: up to 9 chunks of 0..128 bytes each.
    for itest in range(500):
        reference, candidate = hashlib.sha256(), Sha256()
        chunk_count = random.randrange(10)
        for j in range(chunk_count):
            data = secrets.token_bytes(random.randrange(129))
            reference.update(data)
            candidate.update(data)
        a, b = reference.hexdigest(), candidate.hexdigest()
        assert a == b, (a, b)

    print('Sha256 tested successfully.')


if __name__ == '__main__':
    test()
If you only want the hash value:
from hashlib import sha256
data = input('Enter plaintext data: ')
output = sha256(data.encode('utf-8'))
# Print the hex digest; printing the hash object only shows its repr.
print(output.hexdigest())
Python's hashlib also has SHA-1, SHA-384, SHA-512, and MD5 hash functions.
Here is my proposition with redis:
# Replace every stored value with the hex SHA-256 of that value.
# The key list is fetched once; the old loop called rserver.keys() three
# times per iteration.
# NOTE(review): .encode() assumes rserver.get() returns str - confirm how
# the client decodes responses.
for key in rserver.keys():
    mdp_hash = rserver.get(key)
    rserver.set(key, hashlib.sha256(mdp_hash.encode()).hexdigest())
Translating http://en.wikipedia.org/wiki/SHA-2#SHA-256_.28a_SHA-2_variant.29_pseudocode to Python should be straight forward.

Categories

Resources