Matching case words in strings python - python

I'm trying to write a function that checks whether a word is in a string, OR whether the word shares its first len(word)-1 characters with any word in the string.
For example:
word: match string: There is a match -> True
word: matck string: There is a match -> True
The output needs to be True for both examples, because dropping the last character of either matck or match gives matc.
I have written the code below so far:
# For every file's text, count how many search words occur in it either
# verbatim or with only the final character differing from some word of
# the file.  `files`, `words` and `numOfWords` come from the surrounding code.
for idx, f in enumerate(files):
    # Split once per file rather than once per search word.
    file_words = f.split()
    for word in words:
        if word in f:
            numOfWords[idx] += 1
        else:
            for f_word in file_words:
                # Equal once the last character of each word is dropped.
                if word[:-1] == f_word[:-1]:
                    numOfWords[idx] += 1
But it's not good enough: I have a very big list of words and a very big directory of long files, so the run time is not realistic.

You can use Levenshtein distance to check that
def minimumEditDistance(s1, s2):
    """Return the Levenshtein distance between s1 and s2.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only one row of the dynamic-programming table is kept at a time.
    """
    # Keep the shorter string as the row so the row stays as small as possible.
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = list(range(len(s1) + 1))
    for index2, char2 in enumerate(s2):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                # 1 + cheapest of: substitution, deletion, insertion.
                new_distances.append(1 + min(distances[index1],
                                             distances[index1 + 1],
                                             new_distances[-1]))
        distances = new_distances
    return distances[-1]
print(minimumEditDistance("kitten","sitting"))
print(minimumEditDistance("rosettacode","raisethysword"))
https://rosettacode.org/wiki/Levenshtein_distance#Python

Related

How to split word to ngrams in Python?

I've got this question. I need to split a word into n-grams (for example, the word ADVENTURE has three 4-grams: ADVE, ENTU, TURE). The real input is a book file (that's the reason for the counter and isalpha), which I don't have here, so I'm using only a list of 2 words. This is my code in Python:
words = ['adven', 'adventure']
def ngrams(words, n):
# Maps each collected n-gram string to the number of times it was counted.
counter = {}
for word in words:
# Only words with more than n characters are processed.
if (len(word)-1) >= n:
for i in range(0, len(word)):
if word.isalpha() == True:
ngram = ""
# NOTE(review): this loop reuses the name i from the enclosing loop.
for i in range(len(word)):
# word[i:n:] slices from index i up to the fixed end index n, so the
# end of the window never moves past n.
ngram += word[i:n:]
if len(ngram) == n:
# NOTE(review): str.join returns a new string; its result is discarded here.
ngram.join(counter)
counter[ngram] = counter.get(ngram, 0) + 1
return counter
print(trotl(words, 4))
This is what the code gives me:
{'adve': 14}
I don't care about the values in it, but I'm not so good at strings and I don't know what I should change so that it gives me the three 4-grams. I tried "ngram += word[i::]" but that gives me None. Please help me; this is my school homework and I can't write the other functions until this ngrams function works.
use nltk.ngrams for this job:
from nltk import ngrams
I think the definition you have of n-grams is a little bit different from the conventional one, as pointed out by @Stuart in his comment. However, with the definition from your comment, I think the following would solve your problem.
def n_grams(word, n):
    """Return the n-grams of word, each sharing one character with the next.

    Windows start every n - 1 characters; the final n-gram is always the
    last n characters of the word, so it may overlap its predecessor by
    more than one character.  An empty list is returned when n is not
    positive or the word has fewer than n letters.
    """
    # We can't find n-grams if the word has less than n letters.
    if n <= 0 or n > len(word):
        return []
    # For n == 1 a stride of n - 1 would be zero and the scan would never
    # advance, so always move forward by at least one character.
    step = max(n - 1, 1)
    last_n_gram_start = len(word) - n
    # Grab all n-grams except the last one
    output = [word[start_idx:start_idx + n]
              for start_idx in range(0, last_n_gram_start, step)]
    # Grab the last n-gram
    output.append(word[last_n_gram_start:])
    return output
If I've understood the rules correctly, you can do it like this
def special_ngrams(word, n):
    """ Yield character ngrams of word that overlap by only one character,
    except for the last two ngrams which may overlap by more than one
    character. The first and last ngrams of the word are always included.

    Nothing is yielded when n is not positive or word is shorter than n,
    because no complete ngram exists in that case. """
    if n <= 0 or len(word) < n:
        return
    # A stride of n - 1 gives the one-character overlap; clamp it to 1 so
    # that n == 1 does not ask range() for a zero step.
    for start in range(0, len(word) - n, max(n - 1, 1)):
        yield word[start:start + n]
    yield word[-n:]
for word in "hello there this is a test", "adventure", "tyrannosaurus", "advent":
print(list(special_ngrams(word, 4)))

Longest Prefix That is Also a Substring in Second String

This code (adapted from a Prefix-Suffix code) is quite slow for larger corpora:
s1 = 'gafdggeg'
s2 = 'adagafrd'
Output: gaf
def pref_also_substr(s):
n = len(s)
# Try prefixes of s from the longest (the whole string) down to length 1.
for res in range(n, 0, -1):
prefix = s[0: res]
# Substring test against the module-level s1, not against a parameter.
# NOTE(review): the stated output "gaf" is a prefix of s1 found in s2, so
# this presumably expects to be called with the other string — confirm.
if (prefix in s1):
# The prefix length is returned, not the prefix text.
return res
# if no prefix and string2 match occurs
return 0
Any option for an efficient alternative?
I have another approach to solve this question. First, find all substrings of s2; then keep in the dictionary d only the longest substring that s1 starts with.
s2 = 'adagafrd'
# Get all substrings of string
# Using list comprehension + string slicing
# (i is the inclusive start index, j the exclusive end index)
substrings = [s2[i: j] for i in range(len(s2))
              for j in range(i + 1, len(s2) + 1)]
Now you can use startswith() function to check longest prefix from this list of substring and compare the size of substring.
s1 = 'gafdggeg'
# d holds at most one entry: the longest candidate substring that s1 starts
# with, mapped to its length.  `substrings` is the list of candidates built
# beforehand.
d = {}
for substring in substrings:
    if s1.startswith(substring):
        # Replace the stored entry only when this match is strictly longer.
        if not d or len(substring) > next(iter(d.values())):
            d = {substring: len(substring)}
print(d)
Output:
{'gaf': 3}
def f(s1, s2):
    """Return the length of the longest prefix of s1 that occurs in s2.

    Prefixes are tried from length 1 upwards.  After each hit, s2 is
    trimmed to start at the match, since a longer prefix can only occur
    at or after that position.
    """
    for length in range(1, len(s1) + 1):
        p = s1[:length]
        if p in s2:
            s2 = s2[s2.index(p):]
        else:
            return length - 1
    # Every prefix matched (or s1 is empty): the whole of s1 occurs in s2.
    return len(s1)
Check the prefixes starting from length 1.
If a prefix is found, discard the characters of s2 that come before the match and continue searching with the next longer prefix.

How to create a list from results from a given list within a for loop?

I am doing googles python class. And came across this problem:
# A. match_ends
# Given a list of strings, return the count of the number of
# strings where the string length is 2 or more and the first
# and last chars of the string are the same.
# Note: python does not have a ++ operator, but += works.
I tried different approaches, but cant seem to get it to work. This is what i got now:
def match_ends(words):
# Build a new list ordered by string length, shortest first.
words=sorted(words, key=len)
for i in words:
if len(i)<2:
print(i)
# Drops the first element of the length-sorted list.
words=words[1:]
print(words)
for i in words:
# Compares the first two characters with the last two characters.
# NOTE(review): the task statement asks about the first and last single
# character — confirm whether i[0] == i[-1] was intended.
if i[0:2]==i[-2:]:
x=[]
# NOTE(review): append[i] subscripts the method instead of calling it;
# append(i) is presumably what was meant.
x.append[i]
How is this done?
Easy to accomplish using sum and a generator expression:
def match_ends(words):
    """Count the strings of length >= 2 whose first and last characters match."""
    # Each test evaluates to True/False, which sum() adds up as 1/0.
    return sum(len(word) >= 2 and word[0] == word[-1] for word in words)
You could simply do this:
def match_ends(words):
    """Count the strings of length >= 2 whose first and last characters match."""
    count = 0
    for word in words:
        # The length check comes first so empty strings are never indexed.
        if len(word) >= 2 and word[0] == word[-1]:
            count += 1
    return count
A more pythonic solution might be
def func(s):
    """Return True when s has at least two characters and starts and ends alike."""
    return len(s) >= 2 and s[0] == s[-1]
str_list = ['applea', 'b', 'cardc']
filtered_list = [s for s in str_list if (len(s) >= 2 and s[0] == s[-1])]
# or
filtered_list = list(filter(func, str_list))
count = len(filtered_list)
pretty much the same as previous answers, but lambda
match_ends = lambda ws: sum(len(w) >= 2 and w[0] == w[-1] for w in ws)
or 'expanded' form
match_ends = lambda words: len([word for word in words if len(word) > 1 and word[0] == word[-1]])

Removing all instances of the second string from the first

The question states: Write code that takes two strings from the user, and returns what is left over if all instances of the second string is removed from the first. The second string is guaranteed to be no longer than two characters.
I started off with the following:
def remove(l1,l2):
string1 = l1
string2 = l2
result = ""
ctr = 0
while ctr < len(l1):
Since it cannot be longer than 2 characters I think I have to put in an if function as such:
if len(sub) == 2:
if (ctr + 1) < len(string) and string[ctr] == sub[0]
You could just use the replace method to remove all occurrences of the the second string from the first:
def remove(s1, s2):
    """Return s1 with every occurrence of the substring s2 removed."""
    return s1.replace(s2, "")


# Parenthesised so the call works as a function call on Python 3.
print(remove("hello this is a test", "l"))
For a manual method, you can use:
def remove(s1, s2):
    """Return s1 with every occurrence of the substring s2 removed.

    s2 may be at most two characters long; for a longer s2 an explanatory
    message is returned instead of a result.  s2 is matched as a whole:
    with s2 == "ab" only the sequence "ab" is dropped, not every single
    "a" and "b".
    """
    if len(s2) > 2:
        return "The second argument cannot exceed two characters"
    if not s2:
        # Nothing to remove; this also keeps the scan below from stalling.
        return s1
    newString = []
    pos = 0
    while pos < len(s1):
        if s1.startswith(s2, pos):
            pos += len(s2)          # skip the whole match
        else:
            newString.append(s1[pos])
            pos += 1
    return "".join(newString)


print(remove("hello this is a test", "l"))
Yields: heo this is a test
The code looks like this:
def remove(l1, l2):
    """Return the string l1 with every occurrence of the substring l2 removed.

    l1 is scanned left to right; wherever l2 starts, the whole match is
    skipped, otherwise the current character is kept.
    """
    if not l2:
        # An empty pattern matches everywhere without consuming anything,
        # so the scan below would never advance.
        return l1
    width = len(l2)
    kept = []
    ctr = 0
    while ctr < len(l1):
        if l1.startswith(l2, ctr):
            ctr += width
        else:
            kept.append(l1[ctr])
            ctr += 1
    return "".join(kept)
I got it resolved; just took me a little bit of time.
You could use list comprehension:
st1 = "Hello how are you"
st2 = "This is a test"
# Keep each character of st1 that does not appear anywhere in st2
# (a per-character filter, not a substring removal).
st3 = [i for i in st1 if i not in st2]
print(''.join(st3))
Using solely the slice method:
def remove_all(substr, theStr):
    """Return theStr with every occurrence of substr removed, using only slicing.

    Occurrences are found by comparing a slice of theStr against substr at
    each position, so matches that are not adjacent to one another are
    removed as well.
    """
    if not substr:
        return theStr
    width = len(substr)
    pieces = []
    i = 0
    while i < len(theStr):
        if theStr[i:i + width] == substr:
            i += width              # drop this occurrence
        else:
            pieces.append(theStr[i:i + 1])
            i += 1
    return "".join(pieces)
s1 = input()
s2 = input()
# get length of each string
l_s1, l_s2 = len(s1), len(s2)
# list to store the answer
ans = []
i = 0
# Keep comparing while the unread tail of s1 is at least as long as s2.
# An empty s2 would match at every position without moving i forward,
# so the scan is skipped entirely in that case.
while l_s2 > 0 and l_s1 - i >= l_s2:
    j = 0
    # compare the substring from s1 with s2
    while j < l_s2 and s1[i + j] == s2[j]:
        j += 1
    if j == l_s2:
        # full match: discard that substring of s1 and move the pointer past it
        i += j
    else:
        # otherwise keep the i-th character
        ans.append(s1[i])
        i += 1
# append any characters remaining (a tail shorter than s2 cannot match)
ans.extend(s1[i:])
print(''.join(ans))
'''
Sample Testcase
1.
kapil
kd
kapil
2.
devansh
dev
ansh
3.
adarsh
ad
arsh
'''

Finding longest substring in alphabetical order

EDIT: I am aware that a question with similar task was already asked in SO but I'm interested to find out the problem in this specific piece of code. I am also aware that this problem can be solved without using recursion.
The task is to write a program which will find (and print) the longest sub-string in which the letters occur in alphabetical order. If more than 1 equally long sequences were found, then the first one should be printed. For example, the output for a string abczabcd will be abcz.
I have solved this problem with recursion which seemed to pass my manual tests. However when I run an automated tests set which generate random strings, I have noticed that in some cases, the output is incorrect. For example:
if s = 'hixwluvyhzzzdgd', the output is hix instead of luvy
if s = 'eseoojlsuai', the output is eoo instead of jlsu
if s = 'drurotsxjehlwfwgygygxz', the output is dru instead of ehlw
After some time struggling, I couldn't figure out what is so special about these strings that causes the bug.
This is my code:
pos = 0
maxLen = 0
startPos = 0
endPos = 0
# Follows non-decreasing characters of the module-level string s starting
# at index pos, recursing one position at a time.
def last_pos(pos):
if pos < (len(s) - 1):
# Advance when the next character is not smaller than the current one.
if s[pos + 1] >= s[pos]:
pos += 1
if pos == len(s)-1:
# When the final index is reached, len(s) is returned rather than len(s)-1.
return len(s)
else:
return last_pos(pos)
# NOTE(review): indentation was lost in this listing, so it is unclear which
# branch this return belongs to; the caller compares the result with None,
# which suggests some paths were expected to return nothing — confirm.
return pos
for i in range(len(s)):
if last_pos(i+1) != None:
diff = last_pos(i) - i
if diff - 1 > maxLen:
maxLen = diff
startPos = i
endPos = startPos + diff
print s[startPos:endPos+1]
There are many things to improve in your code, but here are the minimum changes needed to make it work. The problem is that you should have if last_pos(i) != None: in your for loop (i instead of i+1), and you should compare diff (not diff - 1) against maxLen. Please read the other answers to learn how to do it better.
for i in range(len(s)):
if last_pos(i) != None:
diff = last_pos(i) - i + 1
if diff > maxLen:
maxLen = diff
startPos = i
endPos = startPos + diff - 1
Here. This does what you want. One pass, no need for recursion.
def find_longest_substring_in_alphabetical_order(s):
    """Return the first longest run of s whose letters never decrease.

    Letters are compared case-insensitively, and the result is sliced from
    s itself so the original casing is kept.  One pass, no recursion.
    An empty s gives an empty result.
    """
    best_start, best_len = 0, 0
    run_start = 0
    prev_char = ''
    for idx, raw in enumerate(s):
        c = raw.lower()
        if prev_char and c < prev_char:
            run_start = idx         # order broken: a new run begins here
        run_len = idx - run_start + 1
        # Checking after every character means the final run is considered
        # too; strict '>' keeps the earliest run on ties.
        if run_len > best_len:
            best_start, best_len = run_start, run_len
        prev_char = c
    return s[best_start:best_start + best_len]
Using it:
>>> find_longest_substring_in_alphabetical_order('hixwluvyhzzzdgd')
'luvy'
>>> find_longest_substring_in_alphabetical_order('eseoojlsuai')
'jlsu'
>>> find_longest_substring_in_alphabetical_order('drurotsxjehlwfwgygygxz')
'ehlw'
Note: It will probably break on strange characters, has only been tested with the inputs you suggested. Since this is a "homework" question, I will leave you with the solution as is, though there is still some optimization to be done, I wanted to leave it a little bit understandable.
You can use nested for loops, slicing and sorted. If the string is not all lower-case then you can convert the sub-strings to lower-case before comparing using str.lower:
def solve(strs):
    """Return the first longest substring of strs whose characters are sorted.

    Every start index is tried; the candidate is extended one character at
    a time and abandoned as soon as it stops being equal to its own sorted
    form.
    """
    maxx = ''
    for i in range(len(strs)):
        # j starts at i so one-character substrings count as well; without
        # that a fully descending string would give an empty result.
        for j in range(i, len(strs)):
            s = strs[i:j + 1]
            if ''.join(sorted(s)) == s:
                # max() returns its first argument on ties, so the earliest
                # longest substring is kept.
                maxx = max(maxx, s, key=len)
            else:
                break
    return maxx
Output:
>>> solve('hixwluvyhzzzdgd')
'luvy'
>>> solve('eseoojlsuai')
'jlsu'
>>> solve('drurotsxjehlwfwgygygxz')
'ehlw'
Python has a powerful builtin package itertools and a wonderful function within groupby
An intuitive use of the Key function can give immense mileage.
In this particular case, you just have to keep a track of order change and group the sequence accordingly. The only exception is the boundary case which you have to handle separately
Code
def find_long_cons_sub(s):
    """Return the first longest run of s whose characters never decrease.

    The string is split with itertools.groupby using a stateful key that
    says whether each character continues the current run.  An empty s
    gives an empty result.
    """
    from itertools import groupby

    class Key(object):
        '''
        The Key function returns
        True:  the character is not smaller than the previous one
               (the first character always gets True)
        False: the character is smaller than the previous one
        '''
        def __init__(self):
            self.last_char = None

        def __call__(self, char):
            resp = True
            if self.last_char is not None:
                # '<=' so that repeated letters stay inside the same run.
                resp = self.last_char <= char
            self.last_char = char
            return resp

    def find_substring(groups):
        '''
        groups alternates between "rising" (key True) and "falling"
        (key False) chunks, beginning with a rising one.

        The Boundary Case is when an increasing sequence starts just
        after a decreasing one: the first character of the new run is
        the last character of the falling chunk, so it is glued onto the
        rising chunk that follows.  The other characters of a falling
        chunk are runs of length one and can never beat an earlier run.
        '''
        first = next(groups, None)
        if first is None:
            return
        yield first
        while True:
            falling = next(groups, None)
            if falling is None:
                return
            # A trailing falling chunk has no rising chunk after it.
            rising = next(groups, [])
            yield falling[-1:] + rising

    # Both kinds of chunk are needed, so nothing is filtered out here.
    groups = (list(g) for _, g in groupby(s, key=Key()))
    # Just determine the maximum sequence based on length
    return ''.join(max(find_substring(groups), key=len, default=''))
Result
>>> find_long_cons_sub('drurotsxjehlwfwgygygxz')
'ehlw'
>>> find_long_cons_sub('eseoojlsuai')
'jlsu'
>>> find_long_cons_sub('hixwluvyhzzzdgd')
'luvy'
Simple and easy.
Code :
s = 'hixwluvyhzzzdgd'
# r: longest run found so far, p: previous character, t: run being built
r, p, t = '', '', ''
for c in s:
    if p <= c:
        t += c
        p = c
    else:
        # Order broken: keep the finished run if it is strictly longer
        # (so the first longest run wins), then start a new one at c.
        if len(t) > len(r):
            r = t
        t, p = c, c
# The last run is never closed inside the loop, so check it here.
if len(t) > len(r):
    r = t
print('Longest substring in alphabetical order is: ' + r)
Output :
Longest substring in alphabetical order is: luvy
Here is a single pass solution with a fast loop. It reads each character only once. Inside the loop operations are limited to
1 string comparison (1 char x 1 char)
1 integer increment
2 integer subtractions
1 integer comparison
1 to 3 integer assignments
1 string assignment
No containers are used. No function calls are made. The empty string is handled without special-case code. All character codes, including chr(0), are properly handled. If there is a tie for the longest alphabetical substring, the function returns the first winning substring it encountered. Case is ignored for purposes of alphabetization, but case is preserved in the output substring.
def longest_alphabetical_substring(string):
    """Return the first longest run of string whose letters never decrease.

    Case is ignored when comparing, but the returned slice keeps the
    casing of the input.  An empty input returns an empty string.
    """
    start, end = 0, 0                    # range of current alphabetical string
    START, END = 0, 0                    # range of longest alphabetical string yet found
    prev = chr(0)                        # previous character
    for char in string.lower():          # scan string ignoring case
        if char < prev:                  # is character out of alphabetical order?
            start = end                  # if so, start a new substring
        end += 1                         # either way, increment substring length
        if end - start > END - START:    # found new longest?
            START, END = start, end      # if so, update longest
        prev = char                      # remember previous character
    return string[START:END]             # return longest alphabetical substring
Result
>>> longest_alphabetical_substring('drurotsxjehlwfwgygygxz')
'ehlw'
>>> longest_alphabetical_substring('eseoojlsuai')
'jlsu'
>>> longest_alphabetical_substring('hixwluvyhzzzdgd')
'luvy'
>>>
a lot more looping, but it gets the job done
s = input("Enter string")
fin = ""    # longest ordered run found so far
s_pos = 0
# Try every start position and extend the run for as long as the
# characters do not decrease.
while s_pos < len(s):
    # Seed the run with its first character; a placeholder character here
    # would end up inside the reported result.
    lng = s[s_pos]
    for c in s[s_pos + 1:]:
        if c >= lng[-1]:
            lng += c
        else:
            break
    if len(lng) > len(fin):
        fin = lng
    s_pos += 1
print("Longest string: " + fin)
def find_longest_order(string=None):
    """Return the first longest run of letters in non-decreasing order.

    string: text to scan.  When omitted, the module-level variable s is
    used, which is how the function is called without arguments.

    The text is lower-cased before comparing and the result is taken from
    the lower-cased text.
    """
    if string is None:
        string = s
    arr = []
    now_long = ''
    prev_char = ''
    for char in string.lower():
        if prev_char and char < prev_char:
            arr.append(now_long)
            now_long = char
        else:
            now_long += char
        prev_char = char
    # The run still being built when the text ends has to be recorded too,
    # otherwise a longest run at the very end is missed.
    arr.append(now_long)
    # max() returns the earliest item among equally long runs.
    return max(arr, key=len)
def main():
    """Print the result of find_longest_order() with a fixed label."""
    print('Longest substring in alphabetical order is: ' + find_longest_order())
main()
Simple and easy to understand:
s = "abcbcd"        # The original string
l = len(s)          # The length of the original string
maxlenstr = s[0]    # longest ordered sub-string so far, seeded with the first letter
curlenstr = s[0]    # ordered sub-string being built, seeded with the first letter
for i in range(1, l):           # the upper bound l itself is not visited
    if s[i] >= s[i - 1]:        # current letter is not smaller than the previous one:
        curlenstr += s[i]       # extend the current sub-string
    else:
        curlenstr = s[i]        # otherwise restart from the current letter
    if len(curlenstr) > len(maxlenstr):  # strictly longer, so the first longest wins
        maxlenstr = curlenstr
print("Longest substring in alphabetical order is:", maxlenstr)
s = input("insert some string: ")
start = 0
end = 0
temp = ""   # longest ordered part found so far
# Looping on start (rather than end + 1) lets a run consisting of the
# final character alone be examined, so a one-character input is reported
# instead of an empty string.
while start < len(s):
    # Stretch end to the last index of the run that begins at start.
    while end + 1 < len(s) and s[end + 1] >= s[end]:
        end += 1
    if end + 1 - start > len(temp):
        temp = s[start:end + 1]
    end += 1
    start = end
print("longest ordered part is: " + temp)
I suppose this is problem set question for CS6.00.1x on EDX. Here is what I came up with.
# input() replaces raw_input(), which does not exist on Python 3; the final
# print() call in this snippet is already written in Python 3 form.
s = input("Enter the string: ")
longest_sub = ""     # longest ordered run found so far
last_longest = ""    # ordered run currently being built
for ch in s:
    if last_longest and last_longest[-1] <= ch:
        last_longest += ch
    else:
        # First character, or the order was broken: start a new run.
        last_longest = ch
    if len(last_longest) > len(longest_sub):
        longest_sub = last_longest
print(longest_sub)
I came up with this solution
def longest_sorted_string(s):
    """Return the first longest substring of s whose characters are sorted.

    A single left-to-right pass tracks where the current non-decreasing
    run started; sorting every possible substring is not needed because a
    substring is sorted exactly when no character in it is smaller than
    its predecessor.
    """
    best_start, best_len = 0, 0
    run_start = 0
    for idx in range(len(s)):
        if idx and s[idx] < s[idx - 1]:
            run_start = idx
        run_len = idx - run_start + 1
        # Strict '>' keeps the earliest run among equally long ones.
        if run_len > best_len:
            best_start, best_len = run_start, run_len
    return s[best_start:best_start + best_len]
Assuming this is from Edx course:
Up to this question, we haven't been taught anything about strings and their advanced operations in Python.
So, I would simply go through the looping and conditional statements
string = ""     # ordered run currently being built
present = ""    # longest ordered run found so far
for i in range(len(s)):     # not len(s)-1, because that would skip the last value
    j = i + 1
    if j >= len(s):
        j = i               # s[i+1] would be out of range; compare the last char with itself
    if s[i] <= s[j]:        # current value does not exceed the next one
        string += s[i]
    elif len(string) != 0 and s[i] > s[j]:  # run ends here; don't lose its last value
        string += s[i]
        if len(string) > len(present):
            present = string
        string = ""
if len(string) > len(present):  # the final run never reaches the elif above
    present = string
# If no two neighbouring letters are in order, the first letter is the answer.
# The `s and` guard keeps an empty s from being indexed.
if s and present == s[len(s) - 1]:
    present = s[0]
print('Longest substring in alphabetical order is:' + present)
I agree with @Abhijit about the power of itertools.groupby() but I took a simpler approach to (ab)using it and avoided the boundary case problems:
from itertools import groupby
LENGTH, LETTERS = 0, 1
def longest_sorted(string):
    """Return the first longest run of string whose letters never decrease.

    groupby() is driven by a key that stays constant within a run and is
    bumped each time a letter is smaller than its predecessor, so every
    group is exactly one run.
    """
    longest_length, longest_letters = 0, []
    key, previous_letter = 0, chr(0)

    def keyfunc(letter):
        nonlocal key, previous_letter
        if letter < previous_letter:
            key += 1            # order broken: this letter opens a new group
        previous_letter = letter
        return key

    for _, group in groupby(string, keyfunc):
        letters = list(group)
        length = len(letters)
        # Strict '>' keeps the earliest run among equally long ones.
        if length > longest_length:
            longest_length, longest_letters = length, letters
    return ''.join(longest_letters)
print(longest_sorted('hixwluvyhzzzdgd'))
print(longest_sorted('eseoojlsuai'))
print(longest_sorted('drurotsxjehlwfwgygygxz'))
print(longest_sorted('abcdefghijklmnopqrstuvwxyz'))
OUTPUT
> python3 test.py
luvy
jlsu
ehlw
abcdefghijklmnopqrstuvwxyz
>
s = 'azcbobobegghakl'
subs2 = s[0]    # longest ordered run found so far
# Try each start index and stretch the run for as long as the letters do
# not decrease.  The last index is skipped: a run starting there has
# length one and can never beat subs2.
for start in range(len(s) - 1):
    end = start + 1
    while end < len(s) and s[end] >= s[end - 1]:
        end += 1
    subs = s[start:end]
    # Strictly longer only, so the first longest run is the one reported.
    if len(subs) > len(subs2):
        subs2 = subs
print("Longest substring in alphabetical order is:", subs2)
s = 'gkuencgybsbezzilbfg'
x = s.lower()
y = ''      # ordered run currently being built
z = []      # every maximal ordered run, in order of appearance
for i, ch in enumerate(x):
    y += ch
    # A run ends at the last character or when the next one is smaller.
    if i == len(x) - 1 or ch > x[i + 1]:
        z.append(y)     # fill the list
        y = ''
# search of 1st longest string: max() returns the earliest of equal lengths
print('Longest substring in alphabetical order is:' + max(z, key=len))
# `s` is the input string, defined before this snippet runs.
first_seq = s[0]    # ordered run currently being extended
current = s[0]      # longest ordered run seen so far
for i in range(len(s) - 1):
    if s[i] <= s[i + 1]:
        first_seq += s[i + 1]
        # Strictly longer only, so the first longest run is kept.
        if len(first_seq) > len(current):
            current = first_seq
    else:
        # Order broken: the next run starts at the following character.
        first_seq = s[i + 1]
print("Longest substring in alphabetical order is: ", current)

Categories

Resources