Given a paragraph of space-separated lowercase English words and a list of unique lowercase English keywords, find the minimum length of the substring of which contains all the keywords that are separated by space in any order.
i put the following code where is the error ? How can i decrease time complexity.
import sys
def minimumLength(text, keys):
answer = 10000000
text += " $"
for i in xrange(len(text) - 1):
dup = list(keys)
word = ""
if i > 0 and text[i - 1] != ' ':
continue
for j in xrange(i, len(text)):
if text[j] == ' ':
for k in xrange(len(dup)):
if dup[k] == word:
del(dup[k])
break
word = ""
else:
word += text[j]
if not dup:
answer = min(answer, j - i)
break
if(answer == 10000000):
answer = -1
return answer
text = raw_input()
keyWords = int(raw_input())
keys = []
for i in xrange(keyWords):
keys.append(raw_input())
print(minimumLength(text, keys))
The trick is to scan from left to right and, once you find a window containing all the keys, try to reduce it on the left and enlarge it on the right preserving the property that all the terms remain inside the window.
Using this strategy you can solve the task in linear time.
The following code is a draft of the code that I tested on few strings, I hope the comments are enough to highlight the most critical steps:
def minimum_length(text, keys):
assert isinstance(text, str) and (isinstance(keys, set) or len(keys) == len(set(keys)))
minimum_length = None
key_to_occ = dict((k, 0) for k in keys)
text_words = [word if word in key_to_occ else None for word in text.split()]
missing_words = len(keys)
left_pos, last_right_pos = 0, 0
# find an interval with all the keys
for right_pos, right_word in enumerate(text_words):
if right_word is None:
continue
key_to_occ[right_word] += 1
occ_word = key_to_occ[right_word]
if occ_word == 1: # the first time we see this word in the current interval
missing_words -= 1
if missing_words == 0: # we saw all the words in this interval
key_to_occ[right_word] -= 1
last_right_pos = right_pos
break
if missing_words > 0:
return None
# reduce the interval on the left and enlarge it on the right preserving the property that all the keys are inside
for right_pos in xrange(last_right_pos, len(text_words)):
right_word = text_words[right_pos]
if right_word is None:
continue
key_to_occ[right_word] += 1
while left_pos < right_pos: # let's try to reduce the interval on the left
left_word = text_words[left_pos]
if left_word is None:
left_pos += 1
continue
if key_to_occ[left_word] == 1: # reduce the interval only if it doesn't decrease the number of occurrences
interval_size = right_pos + 1 - left_pos
if minimum_length is None or interval_size < minimum_length:
minimum_length = interval_size
break
else:
left_pos += 1
key_to_occ[left_word] -= 1
return minimum_length
Related
I have a sequence print(lcp(["flower","flow","flight", "dog"])) which should return fl. Currently I can get it to return flowfl.
I can locate the instances where o or w should be removed, and tried different approaches to remove them. However they seem to hit syntax issue, which I cannot seem to resolve by myself.
I would very much appreciate a little guidance to either have the tools to remedy this issue my self, or learn from a working proposed solution.
def lcp(strs):
if not isinstance(strs, list) or len(strs) == 0:
return ""
if len(strs) == 1:
return strs[0]
original = strs[0]
original_max = len(original)
result = ""
for _, word in enumerate(strs[1:],1):
current_max = len(word)
i = 0
while i < current_max and i < original_max:
copy = "".join(result)
if len(copy) and copy[i-1] not in word:
# result = result.replace(copy[i-1], "")
# result = copy[:i-1]
print(copy[i-1], copy, result.index(copy[i-1]), i, word)
if word[i] == original[i]:
result += word[i]
i += 1
return result
print(lcp(["flower","flow","flight", "dog"])) # returns flowfl should be fl
print(lcp(["dog","car"])) # works
print(lcp(["dog","racecar","car"])) # works
print(lcp([])) # works
print(lcp(["one"])) # works
I worked on an alternative which does not be solve removing inside the same loop, adding a counter at the end. However my instincts suggest it can be solved within the for and while loops without increasing code bloat.
if len(result) > 1:
counter = {char: result.count(char) for char in result}
print(counter)
I have solved this using the below approach.
class Solution:
def longestCommonPrefix(self, strs: List[str]) -> str:
N = len(strs)
if N == 1:
return strs[0]
len_of_small_str, small_str = self.get_min_str(strs)
ans = ""
for i in range(len_of_small_str):
ch = small_str[i]
is_qualified = True
for j in range(N):
if strs[j][i] != ch:
is_qualified = False
break
if is_qualified:
ans += ch
else:
break
return ans
def get_min_str(self, A):
min_len = len(A[0])
s = A[0]
for i in range(1, len(A)):
if len(A[i]) < min_len:
min_len = len(A[i])
s = A[i]
return min_len, s
Returns the longest prefix that the set of words have in common.
def lcp(strs):
if len(strs) == 0:
return ""
result = strs[0]
for word in strs[1:]:
for i, (l1, l2) in enumerate(zip(result, word)):
if l1 != l2:
result = result[:i]
break
else:
result = result[:i+1]
return result
Results:
>>> print(lcp(["flower","flow","flight"]))
fl
>>> print(lcp(["flower","flow","flight", "dog"]))
>>> print(lcp(["dog","car"]))
>>> print(lcp(["dog","racecar","car"]))
>>> print(lcp([]))
>>> print(lcp(["one"]))
one
>>> print(lcp(["one", "one"]))
one
You might need to rephrase your goal.
By your description you don't want the longest common prefix, but the prefix that the most words have in common with the first one.
One of your issues is that your tests only test one real case and four edgecases. Make some more real examples.
Here's my proposition: I mostly added the elif to check if we already have a difference on the first letter to then discard the entry.
It also overwrites the original to rebuild the string based on the common prefix with the next word (if there are any)
def lcp(strs):
if not isinstance(strs, list) or len(strs) == 0:
return ""
if len(strs) == 1:
return strs[0]
original = strs[0]
result = ""
for word in strs[1:]:
i = 0
while i < len(word) and i < len(original) :
if word[i] == original[i]:
result += word[i]
elif i == 0:
result = original
break
i += 1
original = result
result = ""
return original
print(lcp(["flower","flow","flight", "dog"])) # fl
print(lcp(["shift", "shill", "hunter", "shame"])) # sh
print(lcp(["dog","car"])) # dog
print(lcp(["dog","racecar","car"])) # dog
print(lcp(["dog","racecar","dodge"])) # do
print(lcp([])) # [nothing]
print(lcp(["one"])) # one
https://cs50.harvard.edu/x/2020/psets/6/dna/#:~:text=python%20dna.py%20databases/large.csv%20sequences/5.txt
I'm trying to solve this problem from CS50 but it just works for the small database, when I try it for the large one the program overcounts.
import csv
if len(argv) != 3:
print("DIGITA DIREITO, IMBECIL")
exit()
with open(argv[1], "r") as source:
reader = list(csv.reader(source))
reader[0].remove("name")
i = reader[0]
with open(argv[2], "r") as sequence:
seq = sequence.read()
values = []
for j in range(len(i)):
value = 0
counter = 0
pos = 0
prevpos = 0
while pos < len(seq):
pos = seq.find(i[j], pos)
if pos == -1:
counter = 0
break
elif (pos != 1):
counter += 1
prevpos = pos
pos += len(i[j])
if value < counter:
value = counter
values.append(value)
for row in range(len(reader)):
print(reader[row])
print(values)
values = list(map(str, values))
search = list(reader)
search.pop(0)
for result in search:
if result[1:] == values:
print(f"{result[0]}")
break
elif result == search[-1]:
print("No match")
I think you are just counting the STRs repetitions in the sequence, not the maximum consecutive STR repetitions. This is what the problem asks
I'm trying to create a game where the score is dependent on what the letters are worth. I'm having trouble with keeping a count on the side while still recursing to the next letter of the string. I'm really stuck & I hope you can help!
def net_zero():
guess_prompt = input('Guess a string: ')
win_display = 'Congratulations you win'
low_vowels = "aeiou" # +1
low_constants = "bcdfghjklmnpqrstvwxyz" # -1
up_vowels = "AEIOU" # +2
up_constants = "BCDFGHJKLMNPQRSTVWXYZ" # -2
ten_digits = "0123456789" # +3
#else -3
count = 0
if len(guess_prompt) == 0:
return count
elif guess_prompt[0] in low_vowels:
return (count + 1) + guess_prompt[1:]
elif guess_prompt[0] in low_constants:
return (count - 1) + guess_prompt[1:]
elif guess_prompt[0] in up_vowels:
return (count + 2) + guess_prompt[1:]
elif guess_prompt[0] in up_constants:
return (count - 2) + guess_prompt[1:]
elif guess_prompt[0] in ten_digits:
return (count + 3) + guess_prompt[1:]
else: return (count - 3) + guess_prompt[1:]
I think you would like to do following
count = 0
if len(guess_prompt) == 0:
return count
for letter in guess_prompt:
if letter in low_vowels:
count +=1
if letter in low_constants:
count -=1
...
return count
I feel you can use dict instead of using string content for lookup. It will improve lookup time.
guess_prompt = "aaB4??BBBBB"
value = {}
for char in "aeiou":
value[char] = 1
for char in "bcdfghjklmnpqrstvwxyz":
value[char] = -1
for char in "AEIOU":
value[char] = 2
for char in "BCDFGHJKLMNPQRSTVWXYZ":
value[char] = -2
for char in "0123456789":
value[char] = 3
count = 0
for char in guess_prompt:
count = count + value.get(char, -3) #default value -3
print(count) ## PRINTS -13 ##
I am asked to binary search a list of names and if these names start with a particular letter, for example A, then I am to print that name.
I can complete this task by doing much more simple code such as
for i in list:
if i[0] == "A":
print(i)
but instead I am asked to use a binary search and I'm struggling to understand the process behind it. We are given base code which can output the position a given string. My problem is not knowing what to edit so that I can achieve the desired outcome
name_list = ["Adolphus of Helborne", "Aldric Foxe", "Amanita Maleficant", "Aphra the Vicious", "Arachne the Gruesome", "Astarte Hellebore", "Brutus the Gruesome", "Cain of Avernus"]
def bin_search(list, item):
low_b = 0
up_b = len(list) - 1
found = False
while low_b <= up_b and found == False:
midPos = ((low_b + up_b) // 2)
if list[midPos] < item:
low_b = midPos + 1
elif list[midPos] > item:
up_b = midPos - 1
else:
found = True
if found:
print("The name is at positon " + str(midPos))
return midPos
else:
print("The name was not in the list.")
Desired outcome
bin_search(name_list,"A")
Prints all the names starting with A (Adolphus of HelBorne, Aldric Foxe .... etc)
EDIT:
I was just doing some guess and check and found out how to do it. This is the solution code
def bin_search(list, item):
low_b = 0
up_b = len(list) - 1
true_list = []
count = 100
while low_b <= up_b and count > 0:
midPos = ((low_b + up_b) // 2)
if list[midPos][0] == item:
true_list.append(list[midPos])
list.remove(list[midPos])
count -= 1
elif list[midPos] < item:
low_b = midPos + 1
count -= 1
else:
up_b = midPos - 1
count -= 1
print(true_list)
Not too sure if this is what you want as it seems inefficient... as you mention it seems a lot more intuitive to just iterate over the entire list but using binary search i found here i have:
def binary_search(seq, t):
min = 0
max = len(seq) - 1
while True:
if max < min:
return -1
m = (min + max) // 2
if seq[m][0] < t:
min = m + 1
elif seq[m][0] > t:
max = m - 1
else:
return m
index=0
while True:
index=binary_search(name_list,"A")
if index!=-1:
print(name_list[index])
else:
break
del name_list[index]
Output i get:
Aphra the Vicious
Arachne the Gruesome
Amanita Maleficant
Astarte Hellebore
Aldric Foxe
Adolphus of Helborne
You just need to found one item starting with the letter, then you need to identify the range. This approach should be fast and memory efficient.
def binary_search(list,item):
low_b = 0
up_b = len(list) - 1
found = False
midPos = ((low_b + up_b) // 2)
if list[low_b][0]==item:
midPos=low_b
found=True
elif list[up_b][0]==item:
midPos = up_b
found=True
while True:
if found:
break;
if list[low_b][0]>item:
break
if list[up_b][0]<item:
break
if up_b<low_b:
break;
midPos = ((low_b + up_b) // 2)
if list[midPos][0] < item:
low_b = midPos + 1
elif list[midPos] > item:
up_b = midPos - 1
else:
found = True
break
if found:
while True:
if midPos>0:
if list[midPos][0]==item:
midPos=midPos-1
continue
break;
while True:
if midPos<len(list):
if list[midPos][0]==item:
print list[midPos]
midPos=midPos+1
continue
break
else:
print("The name was not in the list.")
the output is
>>> binary_search(name_list,"A")
Adolphus of Helborne
Aldric Foxe
Amanita Maleficant
Aphra the Vicious
Arachne the Gruesome
Astarte Hellebore
I've been playing around with the Boyer-Moore sting search algorithm and starting with a base code set from Shriphani Palakodety I created 2 additional versions (v2 and v3) - each making some modifications such as removing len() function from the loop and than refactoring the while/if conditions. From v1 to v2 I see about a 10%-15% improvement and from v1 to v3 a 25%-30% improvement (significant).
My question is: does anyone have any additional mods that would improve performance even more (if you can submit as a v4) - keeping the base 'algorithm' true to Boyer-Moore.
#!/usr/bin/env python
import time
bcs = {} #the table
def goodSuffixShift(key):
for i in range(len(key)-1, -1, -1):
if key[i] not in bcs.keys():
bcs[key[i]] = len(key)-i-1
#---------------------- v1 ----------------------
def searchv1(text, key):
"""base from Shriphani Palakodety fixed for single char"""
i = len(key)-1
index = len(key) -1
j = i
while True:
if i < 0:
return j + 1
elif j > len(text):
return "not found"
elif text[j] != key[i] and text[j] not in bcs.keys():
j += len(key)
i = index
elif text[j] != key[i] and text[j] in bcs.keys():
j += bcs[text[j]]
i = index
else:
j -= 1
i -= 1
#---------------------- v2 ----------------------
def searchv2(text, key):
"""removed string len functions from loop"""
len_text = len(text)
len_key = len(key)
i = len_key-1
index = len_key -1
j = i
while True:
if i < 0:
return j + 1
elif j > len_text:
return "not found"
elif text[j] != key[i] and text[j] not in bcs.keys():
j += len_key
i = index
elif text[j] != key[i] and text[j] in bcs.keys():
j += bcs[text[j]]
i = index
else:
j -= 1
i -= 1
#---------------------- v3 ----------------------
def searchv3(text, key):
"""from v2 plus modified 3rd if condition
breaking down the comparison for efficiency,
modified the while loop to include the first
if condition (opposite of it)
"""
len_text = len(text)
len_key = len(key)
i = len_key-1
index = len_key -1
j = i
while i >= 0 and j <= len_text:
if text[j] != key[i]:
if text[j] not in bcs.keys():
j += len_key
i = index
else:
j += bcs[text[j]]
i = index
else:
j -= 1
i -= 1
if j > len_text:
return "not found"
else:
return j + 1
key_list = ["POWER", "HOUSE", "COMP", "SCIENCE", "SHRIPHANI", "BRUAH", "A", "H"]
text = "SHRIPHANI IS A COMPUTER SCIENCE POWERHOUSE"
t1 = time.clock()
for key in key_list:
goodSuffixShift(key)
#print searchv1(text, key)
searchv1(text, key)
bcs = {}
t2 = time.clock()
print('v1 took %0.5f ms' % ((t2-t1)*1000.0))
t1 = time.clock()
for key in key_list:
goodSuffixShift(key)
#print searchv2(text, key)
searchv2(text, key)
bcs = {}
t2 = time.clock()
print('v2 took %0.5f ms' % ((t2-t1)*1000.0))
t1 = time.clock()
for key in key_list:
goodSuffixShift(key)
#print searchv3(text, key)
searchv3(text, key)
bcs = {}
t2 = time.clock()
print('v3 took %0.5f ms' % ((t2-t1)*1000.0))
Using "in bcs.keys()" is creating a list and then doing an O(N) search of the list -- just use "in bcs".
Do the goodSuffixShift(key) thing inside the search function. Two benefits: the caller has only one API to use, and you avoid having bcs as a global (horrid ** 2).
Your indentation is incorrect in several places.
Update
This is not the Boyer-Moore algorithm (which uses TWO lookup tables). It looks more like the Boyer-Moore-Horspool algorithm, which uses only the first BM table.
A probable speedup: add the line 'bcsget = bcs.get' after setting up the bcs dict. Then replace:
if text[j] != key[i]:
if text[j] not in bcs.keys():
j += len_key
i = index
else:
j += bcs[text[j]]
i = index
with:
if text[j] != key[i]:
j += bcsget(text[j], len_key)
i = index
Update 2 -- back to basics, like getting the code correct before you optimise
Version 1 has some bugs which you have carried forward into versions 2 and 3. Some suggestions:
Change the not-found response from "not found" to -1. This makes it compatible with text.find(key), which you can use to check your results.
Get some more text values e.g. "R" * 20, "X" * 20, and "XXXSCIENCEYYY" for use with your existing key values.
Lash up a test harness, something like this:
func_list = [searchv1, searchv2, searchv3]
def test():
for text in text_list:
print '==== text is', repr(text)
for func in func_list:
for key in key_list:
try:
result = func(text, key)
except Exception, e:
print "EXCEPTION: %r expected:%d func:%s key:%r" % (e, expected, func.__name__, key)
continue
expected = text.find(key)
if result != expected:
print "ERROR actual:%d expected:%d func:%s key:%r" % (result, expected, func.__name__, key)
Run that, fix the errors in v1, carry those fixes forward, run the tests again until they're all OK. Then you can tidy up your timing harness along the same lines, and see what the performance is. Then you can report back here, and I'll give you my idea of what a searchv4 function should look like ;-)