I know there are several similar questions out there, but my question is quite different and difficult for me.
I have two dictionaries:
d1 = {'a': {'b': {'cs': 10}, 'd': {'cs': 20}}}
d2 = {'a': {'b': {'cs': 30}, 'd': {'cs': 20}}, 'newa': {'q': {'cs': 50}}}
i.e. d1 has key 'a', and d2 has keys 'a' and 'newa' (in other words d1 is my old dict and d2 is my new dict).
I want to iterate over these dictionaries such that, if the key is same check for its value (nested dict), e.g. when I find key 'a' in d2, I will check whether there is 'b', if yes check value of 'cs' (changed from 10 to 30), if this value is changed I want to print it.
Another case is, I want to get key 'newa' from d2 as the newly added key.
Hence, after iterating through these 2 dicts, this is the expected output:
"d2" has new key "newa"
Value of "cs" is changed from 10 to 30 of key "b" which is of key "a"
I have the following code with me, I am trying with many loops which are not working though, but is not a good option too, hence I am looking to find whether I can get expected output with a recursive piece of code.
for k, v in d1.iteritems():
for k1, v1 in d2.iteritems():
if k is k1:
print k
for k2 in v:
for k3 in v1:
if k2 is k3:
print k2, "sub key matched"
else:
print "sorry no match found"
comparing 2 dictionaries using recursion:
Edited for python 3 (works for python 2 as well):
d1= {'a':{'b':{'cs':10},'d':{'cs':20}}}
d2= {'a':{'b':{'cs':30} ,'d':{'cs':20}},'newa':{'q':{'cs':50}}}
def findDiff(d1, d2, path=""):
for k in d1:
if k in d2:
if type(d1[k]) is dict:
findDiff(d1[k],d2[k], "%s -> %s" % (path, k) if path else k)
if d1[k] != d2[k]:
result = [ "%s: " % path, " - %s : %s" % (k, d1[k]) , " + %s : %s" % (k, d2[k])]
print("\n".join(result))
else:
print ("%s%s as key not in d2\n" % ("%s: " % path if path else "", k))
print("comparing d1 to d2:")
findDiff(d1,d2)
print("comparing d2 to d1:")
findDiff(d2,d1)
Python 2 old answer:
def findDiff(d1, d2, path=""):
for k in d1:
if (k not in d2):
print (path, ":")
print (k + " as key not in d2", "\n")
else:
if type(d1[k]) is dict:
if path == "":
path = k
else:
path = path + "->" + k
findDiff(d1[k],d2[k], path)
else:
if d1[k] != d2[k]:
print (path, ":")
print (" - ", k," : ", d1[k])
print (" + ", k," : ", d2[k])
Output:
comparing d1 to d2:
a -> b:
- cs : 10
+ cs : 30
comparing d2 to d1:
a -> b:
- cs : 30
+ cs : 10
Modified user3's code to make it even better
d1= {'as': 1, 'a':
{'b':
{'cs':10,
'qqq': {'qwe':1}
},
'd': {'csd':30}
}
}
d2= {'as': 3, 'a':
{'b':
{'cs':30,
'qqq': 123
},
'd':{'csd':20}
},
'newa':
{'q':
{'cs':50}
}
}
def compare_dictionaries(dict_1, dict_2, dict_1_name, dict_2_name, path=""):
"""Compare two dictionaries recursively to find non mathcing elements
Args:
dict_1: dictionary 1
dict_2: dictionary 2
Returns:
"""
err = ''
key_err = ''
value_err = ''
old_path = path
for k in dict_1.keys():
path = old_path + "[%s]" % k
if not dict_2.has_key(k):
key_err += "Key %s%s not in %s\n" % (dict_2_name, path, dict_2_name)
else:
if isinstance(dict_1[k], dict) and isinstance(dict_2[k], dict):
err += compare_dictionaries(dict_1[k],dict_2[k],'d1','d2', path)
else:
if dict_1[k] != dict_2[k]:
value_err += "Value of %s%s (%s) not same as %s%s (%s)\n"\
% (dict_1_name, path, dict_1[k], dict_2_name, path, dict_2[k])
for k in dict_2.keys():
path = old_path + "[%s]" % k
if not dict_1.has_key(k):
key_err += "Key %s%s not in %s\n" % (dict_2_name, path, dict_1_name)
return key_err + value_err + err
a = compare_dictionaries(d1,d2,'d1','d2')
print a
Output:
Key d2[newa] not in d1
Value of d1[as] (1) not same as d2[as] (3)
Value of d1[a][b][cs] (10) not same as d2[a][b][cs] (30)
Value of d1[a][b][qqq] ({'qwe': 1}) not same as d2[a][b][qqq] (123)
Value of d1[a][d][csd] (30) not same as d2[a][d][csd] (20)
why not use deepdiff library.
see it at: https://github.com/seperman/deepdiff
>>> from deepdiff import DeepDiff
>>> t1 = {1:1, 3:3, 4:4}
>>> t2 = {1:1, 3:3, 5:5, 6:6}
>>> ddiff = DeepDiff(t1, t2)
>>> print(ddiff)
{'dictionary_item_added': {'root[5]', 'root[6]'}, 'dictionary_item_removed': {'root[4]'}}
of course it is more powerful, check the doc for more.
This should provide what you need with helpful functions:
For Python 2.7
def isDict(obj):
return obj.__class__.__name__ == 'dict'
def containsKeyRec(vKey, vDict):
for curKey in vDict:
if curKey == vKey or (isDict(vDict[curKey]) and containsKeyRec(vKey, vDict[curKey])):
return True
return False
def getValueRec(vKey, vDict):
for curKey in vDict:
if curKey == vKey:
return vDict[curKey]
elif isDict(vDict[curKey]) and getValueRec(vKey, vDict[curKey]):
return containsKeyRec(vKey, vDict[curKey])
return None
d1= {'a':{'b':{'cs':10},'d':{'cs':20}}}
d2= {'a':{'b':{'cs':30} ,'d':{'cs':20}},'newa':{'q':{'cs':50}}}
for key in d1:
if containsKeyRec(key, d2):
print "dict d2 contains key: " + key
d2Value = getValueRec(key, d2)
if d1[key] == d2Value:
print "values are equal, d1: " + str(d1[key]) + ", d2: " + str(d2Value)
else:
print "values are not equal, d1: " + str(d1[key]) + ", d2: " + str(d2Value)
else:
print "dict d2 does not contain key: " + key
For Python 3 (or higher):
def id_dict(obj):
return obj.__class__.__name__ == 'dict'
def contains_key_rec(v_key, v_dict):
for curKey in v_dict:
if curKey == v_key or (id_dict(v_dict[curKey]) and contains_key_rec(v_key, v_dict[curKey])):
return True
return False
def get_value_rec(v_key, v_dict):
for curKey in v_dict:
if curKey == v_key:
return v_dict[curKey]
elif id_dict(v_dict[curKey]) and get_value_rec(v_key, v_dict[curKey]):
return contains_key_rec(v_key, v_dict[curKey])
return None
d1 = {'a': {'b': {'cs': 10}, 'd': {'cs': 20}}}
d2 = {'a': {'b': {'cs': 30}, 'd': {'cs': 20}}, 'newa': {'q': {'cs': 50}}}
for key in d1:
if contains_key_rec(key, d2):
d2_value = get_value_rec(key, d2)
if d1[key] == d2_value:
print("values are equal, d1: " + str(d1[key]) + ", d2: " + str(d2_value))
pass
else:
print("values are not equal:\n"
"list1: " + str(d1[key]) + "\n" +
"list2: " + str(d2_value))
else:
print("dict d2 does not contain key: " + key)
For python 3 or higher,
Code for comparing any data.
def do_compare(data1, data2, data1_name, data2_name, path=""):
if operator.eq(data1, data2) and not path:
log.info("Both data have same content")
else:
if isinstance(data1, dict) and isinstance(data2, dict):
compare_dict(data1, data2, data1_name, data2_name, path)
elif isinstance(data1, list) and isinstance(data2, list):
compare_list(data1, data2, data1_name, data2_name, path)
else:
if data1 != data2:
value_err = "Value of %s%s (%s) not same as %s%s (%s)\n"\
% (data1_name, path, data1, data2_name, path, data2)
print (value_err)
# findDiff(data1, data2)
def compare_dict(data1, data2, data1_name, data2_name, path):
old_path = path
for k in data1.keys():
path = old_path + "[%s]" % k
if k not in data2:
key_err = "Key %s%s not in %s\n" % (data1_name, path, data2_name)
print (key_err)
else:
do_compare(data1[k], data2[k], data1_name, data2_name, path)
for k in data2.keys():
path = old_path + "[%s]" % k
if k not in data1:
key_err = "Key %s%s not in %s\n" % (data2_name, path, data1_name)
print (key_err)
def compare_list(data1, data2, data1_name, data2_name, path):
data1_length = len(data1)
data2_length = len(data2)
old_path = path
if data1_length != data2_length:
value_err = "No: of items in %s%s (%s) not same as %s%s (%s)\n"\
% (data1_name, path, data1_length, data2_name, path, data2_length)
print (value_err)
for index, item in enumerate(data1):
path = old_path + "[%s]" % index
try:
do_compare(data1[index], data2[index], data1_name, data2_name, path)
except IndexError:
pass
Adding a version that adds some more capabilities:
can compare arbitrarily nested JSON-like dicts and lists
lets you specify keys to ignore (e.g. in flaky unit tests)
lets you specify keys with numerical values that will be treated as equal as long as they fall within a certain percentage of each other
If you define the deep_diff function as below and call it on #rkatkam's example you'll get:
>>> deep_diff(d1, d2)
{'newa': (None, {'q': {'cs': 50}}), 'a': {'b': {'cs': (10, 30)}}}
Here's the function definition:
def deep_diff(x, y, parent_key=None, exclude_keys=[], epsilon_keys=[]):
"""
Take the deep diff of JSON-like dictionaries
No warranties when keys, or values are None
"""
# pylint: disable=unidiomatic-typecheck
EPSILON = 0.5
rho = 1 - EPSILON
if x == y:
return None
if parent_key in epsilon_keys:
xfl, yfl = float_or_None(x), float_or_None(y)
if xfl and yfl and xfl * yfl >= 0 and rho * xfl <= yfl and rho * yfl <= xfl:
return None
if not (isinstance(x, (list, dict)) and (isinstance(x, type(y)) or isinstance(y, type(x)))):
return x, y
if isinstance(x, dict):
d = type(x)() # handles OrderedDict's as well
for k in x.keys() ^ y.keys():
if k in exclude_keys:
continue
if k in x:
d[k] = (deepcopy(x[k]), None)
else:
d[k] = (None, deepcopy(y[k]))
for k in x.keys() & y.keys():
if k in exclude_keys:
continue
next_d = deep_diff(
x[k], y[k], parent_key=k, exclude_keys=exclude_keys, epsilon_keys=epsilon_keys
)
if next_d is None:
continue
d[k] = next_d
return d if d else None
# assume a list:
d = [None] * max(len(x), len(y))
flipped = False
if len(x) > len(y):
flipped = True
x, y = y, x
for i, x_val in enumerate(x):
d[i] = (
deep_diff(
y[i], x_val, parent_key=i, exclude_keys=exclude_keys, epsilon_keys=epsilon_keys
)
if flipped
else deep_diff(
x_val, y[i], parent_key=i, exclude_keys=exclude_keys, epsilon_keys=epsilon_keys
)
)
for i in range(len(x), len(y)):
d[i] = (y[i], None) if flipped else (None, y[i])
return None if all(map(lambda x: x is None, d)) else d
Adding a non-recursive solution.
# Non Recursively traverses through a large nested dictionary
# Uses a queue of dicts_to_process to keep track of what needs to be traversed rather than using recursion.
# Slightly more complex than the recursive version, but arguably better as there is no risk of stack overflow from
# too many levels of recursion
def get_dict_diff_non_recursive(dict1, dict2):
dicts_to_process=[(dict1,dict2,"")]
while dicts_to_process:
d1,d2,current_path = dicts_to_process.pop()
for key in d1.keys():
current_path = os.path.join(current_path, f"{key}")
#print(f"searching path {current_path}")
if key not in d2 or d1[key] != d2[key]:
print(f"difference at {current_path}")
if type(d1[key]) == dict:
dicts_to_process.append((d1[key],d2[key],current_path))
elif type(d1[key]) == list and d1[key] and type(d1[key][0]) == dict:
for i in range(len(d1[key])):
dicts_to_process.append((d1[key][i], d2[key][i],current_path))
I have not liked many of the answers I have found across many threads... A lot of them recommend using deepdiff which is very powerful dont get me wrong but it just does not give me the output I was desiring which is not just a string of the diffs, or a newly built strange-looking dictionary with new keys collected from the nested keys of the original... but actually return a real dictionary with the original keys and delta values.
My use case for this is to send smaller payloads or none if there is no difference over an MQTT network.
The soluton I found is partially stolen from this link, however modified it to just give me the deltas. Then I recursively parse it, calling diff_dict() again if its nested to build the final diff dictionary. It turned out to be much simpler than many examples out there. FYI it does not care about sorting.
My Solution:
def diff_dict(d1, d2):
d1_keys = set(d1.keys())
d2_keys = set(d2.keys())
shared_keys = d1_keys.intersection(d2_keys)
shared_deltas = {o: (d1[o], d2[o]) for o in shared_keys if d1[o] != d2[o]}
added_keys = d2_keys - d1_keys
added_deltas = {o: (None, d2[o]) for o in added_keys}
deltas = {**shared_deltas, **added_deltas}
return parse_deltas(deltas)
def parse_deltas(deltas: dict):
res = {}
for k, v in deltas.items():
if isinstance(v[0], dict):
tmp = diff_dict(v[0], v[1])
if tmp:
res[k] = tmp
else:
res[k] = v[1]
return res
Example:
original = {
'int': 1,
'float': 0.1000,
'string': 'some string',
'bool': True,
'nested1': {
'int': 2,
'float': 0.2000,
'string': 'some string2',
'bool': True,
'nested2': {
'string': 'some string3'
}
}
}
new = {
'int': 2,
'string': 'some string',
'nested1': {
'int': 2,
'float': 0.5000,
'string': 'new string',
'bool': False,
'nested2': {
'string': 'new string nested 2 time'
}
},
'test_added': 'added_val'
}
print(diff_dict(original, new))
Output:
{'int': 2, 'nested1': {'string': 'new string', 'nested2': {'string': 'new string nested 2 time'}, 'bool': False, 'float': 0.5}, 'test_added': 'added_val'}
Solution
def compare_dicts(dict1, dict2, indent=4, level=0, offset=0):
if not (isinstance(dict1, dict) or isinstance(dict2, dict)):
if dict1 == dict2:
return 'OK!'
else:
return 'MISMATCH!'
if level > 0:
print()
keys1 = set(dict1.keys())
keys2 = set(dict2.keys())
if len(keys1 | keys2) == 0:
return '' if level else None
max_len = max(tuple(map(len, keys1 | keys2))) + 2
for key in keys1 & keys2:
print(' '*indent*level + f'{key+":":<{max_len}}', end='')
print(compare_dicts(dict1[key], dict2[key], indent=indent, level=level+1))
for key in keys1 - keys2:
print(' '*indent*level + f'{key+":":<{max_len}}'
+ 'presented only in dict 1!', end='')
for key in keys2 - keys1:
print(' '*indent*level + f'{key+":":<{max_len}}'
+ 'presented only in dict 2!', end='')
return '' if level else None
Example
dict1 = {
'a': 1,
'b': {
'ba': 21,
'bb': 22,
'bc': 23,
},
'c': 3,
'd': 4,
}
dict2 = {
'a': 1,
'b': {
'ba': 21,
'bb': -22,
},
'c': 3,
'd': -4,
'e': 5,
}
compare_dicts(dict1, dict2)
Output
b:
bb: MISMATCH!
ba: OK!
bc: presented only in dict 1!
a: OK!
d: MISMATCH!
c: OK!
e: presented only in dict 2!
I'm trying to pull nested values from a json file. I want to print out each of the values for every "id" key. I think I'm close but can't figure out why the obj type changes from a dict to a list, and then why I'm unable to parse that list.
Here is a link to the json I'm working with: http://hastebin.com/ratevimixa.tex
and here is my current code:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import json
json_data = open('JubJubProductions.json', 'r+')
jdata = json.loads(json_data.read().decode("utf-8"))
def recursion(dict):
for key, value in dict.items():
if type(value) == type(dict):
if key != "paging":
for key, value in value.items():
if isinstance (value,list):
print key
# place where I need to enter list comprehension?
if type(value) == type(dict):
if key == "id":
print " id found " + value
if key != "id":
print key + " 1st level"
if key == "id":
print key
else:
if key == "id":
print "id found " + value
if __name__ == '__main__':
recursion(jdata)
-------------------------------------------------------------------------------------------update
This is now what I'm working with and it'll return a single id value, but not all of them:
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import json
json_data = open('jubjubProductions', 'r+')
jdata = json.loads(json_data.read().decode("utf-8"))
def id_generator(d):
for k, v in d.items():
if k == "id":
yield v
elif isinstance(v, dict):
for id_val in id_generator(v):
yield id_val
if __name__ == '__main__':
for _ in id_generator(jdata):
print (_)
The JSON might contain a list of objects, which needs to be searched:
Python 2.7 version:
def item_generator(json_input, lookup_key):
if isinstance(json_input, dict):
for k, v in json_input.iteritems():
if k == lookup_key:
yield v
else:
for child_val in item_generator(v, lookup_key):
yield child_val
elif isinstance(json_input, list):
for item in json_input:
for item_val in item_generator(item, lookup_key):
yield item_val
Python 3.x version:
def item_generator(json_input, lookup_key):
if isinstance(json_input, dict):
for k, v in json_input.items():
if k == lookup_key:
yield v
else:
yield from item_generator(v, lookup_key)
elif isinstance(json_input, list):
for item in json_input:
yield from item_generator(item, lookup_key)
def id_generator(dict_var):
for k, v in dict_var.items():
if k == "id":
yield v
elif isinstance(v, dict):
for id_val in id_generator(v):
yield id_val
This will create an iterator which will yield every value on any level under key "id". Example usage (printing all of those values):
for _ in id_generator(some_json_dict):
print(_)
A little bit cleaner code (in python 3.x).
def parse_json_recursively(json_object, target_key):
if type(json_object) is dict and json_object:
for key in json_object:
if key == target_key:
print("{}: {}".format(target_key, json_object[key]))
parse_json_recursively(json_object[key], target_key)
elif type(json_object) is list and json_object:
for item in json_object:
parse_json_recursively(item, target_key)
json_object = {"key1": "val1", "key2": [{"key3":"val3", "key4": "val4"}, 123, "abc"]}
target_key = "key3"
parse_json_recursively(json_object, target_key) # Ouput key3: val3
Here is a simple recursive function to collect all values from a json document for a given key. Values can be json documents as well. The corresponding values appended to search_result.
def json_full_search(lookup_key, json_dict, search_result = []):
if type(json_dict) == dict:
for key, value in json_dict.items():
if key == lookup_key:
search_result.append(value)
json_full_search(lookup_key, value, search_result)
elif type(json_dict) == list:
for element in json_dict:
json_full_search(lookup_key, element, search_result)
return search_result
def get_val(j, s, v=None):
for k in j:
if v == None and k == s:
return j[k]
elif v != None and k == s and v == j[k]:
return True
elif v != None and k == s and v != j[k]:
return False
elif isinstance(j[k], dict):
return get_val(j[k], s, v)
You can use with for a json list l below,
for l in j:
r = get_val(l, 'key')
print(r)
for l in j:
r = get_val(l, 'mac', '00-60-2F-5A-04-51')
print(r)
Extension to python 3.x answer:
If nested json has similar keys under different list or dictionaries and you want to take first value of it..
below is the generic way:
def get_value_from_generator(json_input, lookup_key):
value = list(item_generator(json_input, lookup_key))
val = value[0] if value else None
print(f'lookup_key -> value : {val}')
return val
def item_generator(json_input, lookup_key):
if isinstance(json_input, dict):
for k, v in json_input.items():
print(f'{k} -- {v}')
if k == lookup_key:
yield v
else:
yield from item_generator(v, lookup_key)
elif isinstance(json_input, list):
for item in json_input:
yield from item_generator(item, lookup_key)
How can I test if two dictionaries are equal while taking some keys out of consideration. For example,
equal_dicts(
{'foo':1, 'bar':2, 'x':55, 'y': 77 },
{'foo':1, 'bar':2, 'x':66, 'z': 88 },
ignore_keys=('x', 'y', 'z')
)
should return True.
UPD: I'm looking for an efficient, fast solution.
UPD2. I ended up with this code, which appears to be the fastest:
def equal_dicts_1(a, b, ignore_keys):
ka = set(a).difference(ignore_keys)
kb = set(b).difference(ignore_keys)
return ka == kb and all(a[k] == b[k] for k in ka)
Timings: https://gist.github.com/2651872
def equal_dicts(d1, d2, ignore_keys):
d1_filtered = {k:v for k,v in d1.items() if k not in ignore_keys}
d2_filtered = {k:v for k,v in d2.items() if k not in ignore_keys}
return d1_filtered == d2_filtered
EDIT: This might be faster and more memory-efficient:
def equal_dicts(d1, d2, ignore_keys):
ignored = set(ignore_keys)
for k1, v1 in d1.iteritems():
if k1 not in ignored and (k1 not in d2 or d2[k1] != v1):
return False
for k2, v2 in d2.iteritems():
if k2 not in ignored and k2 not in d1:
return False
return True
Using dict comprehensions:
>>> {k: v for k,v in d1.items() if k not in ignore_keys} == \
... {k: v for k,v in d2.items() if k not in ignore_keys}
Use .viewitems() instead on Python 2.
Here's another variant:
set(ignore_keys).issuperset(k for (k, v) in d1.items() ^ d2.items())
Its virtues:
C speed identification of differences between the dicts
C speed check for membership in the set of ignored keys
Early-out if a single mismatch is found
Very very crudely, you could just delete any ignored keys and compare those dictionaries:
def equal_dicts(d1, d2, ignore_keys=()):
d1_, d2_ = d1.copy(), d2.copy()
for k in ignore_keys:
try:
del d1_[k]
except KeyError:
pass
try:
del d2_[k]
except KeyError:
pass
return d1_ == d2_
(Note that we don't need a deep copy here, we just need to avoid modifying d1 and d2.)
def compare_dict(d1, d2, ignore):
for k in d1:
if k in ignore:
continue
try:
if d1[k] != d2[k]:
return False
except KeyError:
return False
return True
Comment edit: You can do something like compare_dict(d1, d2, ignore) and compare_dict(d2, d1, ignore) or duplicate the for
def compare_dict(d1, d2, ignore):
ignore = set(ignore)
for k in d1:
if k in ignore:
continue
try:
if d1[k] != d2[k]:
return False
except KeyError:
return False
for k in d2:
if k in ignore:
continue
try:
if d1[k] != d2[k]:
return False
except KeyError:
return False
return True
Whatever is faster and cleaner!
Update: cast set(ignore)
If you need this check when testing, you can use the ANY from the unittest.mock library.
Here is an example.
from unittest.mock import ANY
actual = {'userName':'bob', 'lastModified':'2012-01-01'}
expected = {'userName':'bob', 'lastModified': ANY}
assert actual == expected
See more
Optimal solution for the case of ignoring only one key
return all(
(x == y or (x[1] == y[1] == 'key to ignore')) for x, y in itertools.izip(
d1.iteritems(), d2.iteritems()))
in case your dictionary contained lists or other dictionaries:
def equal_dicts(d1, d2, ignore_keys, equal):
# print('got d1', d1)
# print('got d2', d2)
if isinstance(d1, str):
if not isinstance(d2, str):
return False
return d1 == d2
for k in d1:
if k in ignore_keys:
continue
if not isinstance(d1[k], dict) and not isinstance(d1[k], list) and d2.get(k) != d1[k]:
print(k)
equal = False
elif isinstance(d1[k], list):
if not isinstance(d2.get(k), list):
equal = False
if len(d1[k]) != len(d2[k]):
return False
if len(d1[k]) > 0 and isinstance(d1[k][0], dict):
if not isinstance(d2[k][0], dict):
return False
d1_sorted = sorted(d1[k], key=lambda item: item.get('created'))
d2_sorted = sorted(d2[k], key=lambda item: item.get('created'))
equal = all(equal_dicts(x, y, ignore_keys, equal) for x, y in zip(d1_sorted, d2_sorted)) and equal
else:
equal = all(equal_dicts(x, y, ignore_keys, equal) for x, y in zip(d1[k], d2[k])) and equal
elif isinstance(d1[k], dict):
if not isinstance(d2.get(k), dict):
equal = False
print(k)
equal = equal_dicts(d1[k], d2[k], ignore_keys, equal) and equal
return equal
I've been playing around with the Boyer-Moore sting search algorithm and starting with a base code set from Shriphani Palakodety I created 2 additional versions (v2 and v3) - each making some modifications such as removing len() function from the loop and than refactoring the while/if conditions. From v1 to v2 I see about a 10%-15% improvement and from v1 to v3 a 25%-30% improvement (significant).
My question is: does anyone have any additional mods that would improve performance even more (if you can submit as a v4) - keeping the base 'algorithm' true to Boyer-Moore.
#!/usr/bin/env python
import time
bcs = {} #the table
def goodSuffixShift(key):
for i in range(len(key)-1, -1, -1):
if key[i] not in bcs.keys():
bcs[key[i]] = len(key)-i-1
#---------------------- v1 ----------------------
def searchv1(text, key):
"""base from Shriphani Palakodety fixed for single char"""
i = len(key)-1
index = len(key) -1
j = i
while True:
if i < 0:
return j + 1
elif j > len(text):
return "not found"
elif text[j] != key[i] and text[j] not in bcs.keys():
j += len(key)
i = index
elif text[j] != key[i] and text[j] in bcs.keys():
j += bcs[text[j]]
i = index
else:
j -= 1
i -= 1
#---------------------- v2 ----------------------
def searchv2(text, key):
"""removed string len functions from loop"""
len_text = len(text)
len_key = len(key)
i = len_key-1
index = len_key -1
j = i
while True:
if i < 0:
return j + 1
elif j > len_text:
return "not found"
elif text[j] != key[i] and text[j] not in bcs.keys():
j += len_key
i = index
elif text[j] != key[i] and text[j] in bcs.keys():
j += bcs[text[j]]
i = index
else:
j -= 1
i -= 1
#---------------------- v3 ----------------------
def searchv3(text, key):
"""from v2 plus modified 3rd if condition
breaking down the comparison for efficiency,
modified the while loop to include the first
if condition (opposite of it)
"""
len_text = len(text)
len_key = len(key)
i = len_key-1
index = len_key -1
j = i
while i >= 0 and j <= len_text:
if text[j] != key[i]:
if text[j] not in bcs.keys():
j += len_key
i = index
else:
j += bcs[text[j]]
i = index
else:
j -= 1
i -= 1
if j > len_text:
return "not found"
else:
return j + 1
key_list = ["POWER", "HOUSE", "COMP", "SCIENCE", "SHRIPHANI", "BRUAH", "A", "H"]
text = "SHRIPHANI IS A COMPUTER SCIENCE POWERHOUSE"
t1 = time.clock()
for key in key_list:
goodSuffixShift(key)
#print searchv1(text, key)
searchv1(text, key)
bcs = {}
t2 = time.clock()
print('v1 took %0.5f ms' % ((t2-t1)*1000.0))
t1 = time.clock()
for key in key_list:
goodSuffixShift(key)
#print searchv2(text, key)
searchv2(text, key)
bcs = {}
t2 = time.clock()
print('v2 took %0.5f ms' % ((t2-t1)*1000.0))
t1 = time.clock()
for key in key_list:
goodSuffixShift(key)
#print searchv3(text, key)
searchv3(text, key)
bcs = {}
t2 = time.clock()
print('v3 took %0.5f ms' % ((t2-t1)*1000.0))
Using "in bcs.keys()" is creating a list and then doing an O(N) search of the list -- just use "in bcs".
Do the goodSuffixShift(key) thing inside the search function. Two benefits: the caller has only one API to use, and you avoid having bcs as a global (horrid ** 2).
Your indentation is incorrect in several places.
Update
This is not the Boyer-Moore algorithm (which uses TWO lookup tables). It looks more like the Boyer-Moore-Horspool algorithm, which uses only the first BM table.
A probable speedup: add the line 'bcsget = bcs.get' after setting up the bcs dict. Then replace:
if text[j] != key[i]:
if text[j] not in bcs.keys():
j += len_key
i = index
else:
j += bcs[text[j]]
i = index
with:
if text[j] != key[i]:
j += bcsget(text[j], len_key)
i = index
Update 2 -- back to basics, like getting the code correct before you optimise
Version 1 has some bugs which you have carried forward into versions 2 and 3. Some suggestions:
Change the not-found response from "not found" to -1. This makes it compatible with text.find(key), which you can use to check your results.
Get some more text values e.g. "R" * 20, "X" * 20, and "XXXSCIENCEYYY" for use with your existing key values.
Lash up a test harness, something like this:
func_list = [searchv1, searchv2, searchv3]
def test():
for text in text_list:
print '==== text is', repr(text)
for func in func_list:
for key in key_list:
try:
result = func(text, key)
except Exception, e:
print "EXCEPTION: %r expected:%d func:%s key:%r" % (e, expected, func.__name__, key)
continue
expected = text.find(key)
if result != expected:
print "ERROR actual:%d expected:%d func:%s key:%r" % (result, expected, func.__name__, key)
Run that, fix the errors in v1, carry those fixes forward, run the tests again until they're all OK. Then you can tidy up your timing harness along the same lines, and see what the performance is. Then you can report back here, and I'll give you my idea of what a searchv4 function should look like ;-)