sentence = 'Cunning fox peels apples.'.strip('.')


def longest_word(target):
    """Return the first longest word of *target* and its letter count.

    Args:
        target: Text whose whitespace-separated words are compared by length.

    Returns:
        A ``(word, letters)`` tuple. ``word`` is the first word with the
        greatest length and ``letters`` is the number of alphabetic
        characters in it. For text without any words the result is
        ``('', 0)``.
    """
    # ``default`` keeps empty or whitespace-only text from raising ValueError.
    longest = max(target.split(), key=len, default='')
    # Count letters only, so punctuation inside the word is not included.
    letters = sum(1 for char in longest if char.isalpha())
    return longest, letters


print(longest_word(sentence))
The code works if the longest word in a sentence is strictly longer in symbols than any other, however, how should I adjust the code if the sentence is something like:
sentence = 'Black bananas and green tomatos are red.'
How can I return that there are n words that are equally long? Obviously it's enough to count the symbols in one of the words, but the:
set = max(sentence.split(),key=len)
returns only the first of the longest words.
Use the itertools module. It has a groupby() function that can be used to group an iterator based on a custom-defined function, in this case len():
>>> sentence = 'Black bananas and green tomatos are red.'
>>> words = sorted(sentence.strip(".").split(), key=len)
>>> groups = [list(g) for k,g in itertools.groupby(words, len)]
>>> groups
[['and', 'are', 'red'], ['Black', 'green'], ['bananas', 'tomatos']]
>>> groups[-1]
['bananas', 'tomatos']
You probably need to make two passes along the list of targets, once to get the maximum length, and then to select all words whose length matches the maximum:
def longest_words(targets):
    """Return the set of all words in *targets* that share the maximum length.

    Args:
        targets: Text that is split on whitespace into words.

    Returns:
        A set with every longest word; an empty set when there are no words.
    """
    words = targets.split()
    if not words:
        # max() would raise ValueError on an empty sequence.
        return set()
    # First pass finds the maximum length, second pass selects the matches.
    max_len = max(len(item) for item in words)
    return {item for item in words if len(item) == max_len}
Quick test:
In [17]: sentence = 'Black bananas and green tomatos are red.'
In [18]: longest_words(sentence.strip('.'))
Out[18]: {'bananas', 'tomatos'}
You can first get maximum length, and check and fetch each word of the same length:
sentence = 'Black bananas and green tomatos are red.'.strip('.')


def longest_word(target):
    """Return the maximum word length in *target* and every word of that length.

    Args:
        target: Text that is split on whitespace into words.

    Returns:
        A ``(max_len, words)`` tuple where ``words`` lists, in their original
        order, all words whose length equals ``max_len``. Text without words
        gives ``(0, [])``.
    """
    words = target.split()
    # A generator avoids a temporary list; ``default`` covers empty input.
    max_len = max((len(w) for w in words), default=0)
    return max_len, [w for w in words if len(w) == max_len]


print(longest_word(sentence))
You can also define a customized len function to count only characters,
def word_len(word):
    """Return the number of alphabetic characters in *word*."""
    # Summing over a generator avoids building a list just to measure it.
    return sum(1 for c in word if c.isalpha())
and replace len with word_len in the previous example.
My function returns a list with the longest word or words.
def longest(sentence):
    """Return a list with the longest word or words of *sentence*.

    Args:
        sentence: Text that is split on whitespace into words.

    Returns:
        All words of maximum length, last occurrence first (the order a
        stable length sort read from its end produces). An empty list when
        the sentence contains no words.
    """
    words = sentence.split()
    if not words:
        # Indexing the last element of an empty list would raise IndexError.
        return []
    longest_len = max(map(len, words))
    # Walking the words backwards gives the same order as sorting by length
    # and reading from the end, without needing the sort.
    return [word for word in reversed(words) if len(word) == longest_len]
Related
from collections import Counter

a = ['ab', 'absa', 'sbaa', 'basa', 'ba']
# Anagrams share the same letters, so sorting the letters of each word gives
# a common signature; the signatures themselves are kept in sorted order.
res = sorted(''.join(sorted(word)) for word in a)
# Signature -> number of words having it. Counter keeps the (sorted)
# insertion order of ``res`` and counts in a single pass.
d = dict(Counter(res))
wordfreq = [d[p] for p in res]
all_values = d.values()  # all_values is a dict view, not a list
max_value = max(all_values)
print(max_value)
# Prints the winning signature (e.g. 'aabs'), not the original words.
max_key = max(d, key=d.get)
print(max_key)
In the given problem the user inputs several words, some of which are anagrams of each other. The output should be the size of the largest group of anagrams, followed by the words in that group.
Could you please help me print those anagrams from the input? It would be really helpful.
Output:
3 aabs
Expected output:
3
absa sbaa basa
You can create a dictionary that maps each sorted word (the anagram key) to the list of its anagrams,
and then print the list that contains the maximum number of elements.
from collections import defaultdict
words = ['ab', 'absa', 'sbaa', 'basa', 'ba']
wordToAnagram = defaultdict(list)
# Sorted-letter key -> list of the input words that are anagrams of it.
# The loop below will create {'ab': ['ab', 'ba'], 'aabs': ['absa', 'sbaa', 'basa']}
for candidate in words:
    s = "".join(sorted(candidate))
    wordToAnagram[s].append(candidate)
# Pick the (key, anagrams) pair whose anagram list is the longest.
word, anagrams = max(wordToAnagram.items(), key=lambda x: len(x[1]))
# The expected output starts with the size of the group, so print it first.
print(len(anagrams))
print(" ".join(anagrams))
OUTPUT:
3
absa sbaa basa
Details
wordToAnagrams
After iterating through words
wordToAnagram(dictionary) looks like this
{
"ab" : ["ab", "ba"]
"aabs": ["absa", "sbaa", "basa"]
}
dictionary.items()
wordToAnagram.items() returns tuple-pair of dictionary key-value
where,
key: is our sorted string "ab" or "aabs",
value : is list of anagrams, e.g for key = "ab", value equals["ab", "ba"]
dict_items([('ab', ['ab', 'ba']), ('aabs', ['absa', 'sbaa', 'basa'])])
max function using 'key' and lambda expression
max(wordToAnagram.items(), key=lambda x: len(x[1]))
finds the maximum item of the wordToAnagram.items() iterable by comparing the lengths of the anagram lists (len(x[1])).
You can try with numpy
and mode from statistics module
import numpy as np
from statistics import mode
words = ['ab', 'absa', 'sbaa', 'basa', 'ba']
# Reduce every word to its letters in sorted order; anagrams end up equal.
sorted_words = list(map(''.join, map(sorted, words)))
# mode() gives the sorted-letter form that occurs most often.
most_common_form = mode(sorted_words)
# Element-wise comparison produces a boolean mask over the words ...
is_top_anagram = np.array(sorted_words) == most_common_form
# ... and indexing with that mask keeps only the matching original words.
max_freq_anagrams = np.array(words)[is_top_anagram]
print(len(max_freq_anagrams))
print(list(max_freq_anagrams))
In case you have multiple max frequent words e.g.
words = ['ab','absa','sbaa','basa','ba', 'ba']
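then instead of mode(sorted_words) use max(set(sorted_words), key=sorted_words.count), which returns one of the most frequent words (on a tie, which one you get depends on the iteration order of the set). Note that before Python 3.8 mode() raised StatisticsError when there was more than one most frequent value; from 3.8 on it returns the first one encountered.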
Input a given string and check if any word in that string matches with its reverse in the same string then print that word else print $
I split the string and put the words in a list, and then I reversed each word in that list. After that, I was not able to compare the two lists.
# Attempt from the question: print the first word whose reverse also occurs
# in the input, otherwise print '$'.
# NOTE(review): the statements under `for`, `if` and `else` are not indented
# here, so this snippet does not run as written.
# NOTE(review): the name `str` shadows the built-in `str` type.
str = input()
x = str.split()
for i in x: # printing i shows the words in the list
str1 = i[::-1] # printing str1 shows the reverse of words in a new list
# now how to check if any word of the new list matches to any word of the old list
# NOTE(review): `i == str` compares one word with the whole input line; the
# reversed word `str1` is never used in the comparison.
if(i==str):
print(i)
break
else:
# NOTE(review): the string literal on the next line is missing its closing quote.
print('$)
Input: suman is a si boy.
Output: is ( since reverse of 'is' is present in the same string)
You almost have it, just need to add another loop to compare each word against each inverted word. Try using the following
# Print every word whose reverse also appears among the input words.
text = input()  # named `text` so the built-in `str` is not shadowed
words = text.split()
for word in words:
    mirrored = word[::-1]
    if len(mirrored) <= 1:
        # Optional condition: single-letter words are not reported.
        continue
    for other in words:  # <-- this is the new nested loop you are missing
        if other == mirrored:  # compare each inverted word against each regular word
            print(word)
Update
To only print the first occurrence of a match, you could, in the second loop, only check the elements that come after. We can do this by keeping track of the index:
# Print a word when its reverse appears later in the input.
text = input()  # named `text` so the built-in `str` is not shadowed
words = text.split()
for index, word in enumerate(words):
    mirrored = word[::-1]
    if len(mirrored) <= 1:
        # Single-letter words are not reported.
        continue
    for later in words[index + 1:]:  # <-- only consider words that are ahead
        if later == mirrored:
            print(word)
Note that I used index+1 in order to not consider single word palindromes a match.
a = 'suman is a si boy'
# Split the text on single spaces to get the words.
words = a.split(' ')
# Build the mirror image of every word.
reversed_words = list(map(lambda token: token[::-1], words))
# Words present in both collections are exactly those whose reverse
# also occurs in the text.
forward, backward = set(words), set(reversed_words)
print(forward & backward)
will print:
{'si', 'is', 'a'}
Another way to do this is just in a list comprehension:
string = 'suman is a si boy'
# Split once and use a set for membership tests, instead of re-splitting
# and scanning the whole string for every word.
words = string.split()
known_words = set(words)
# Keep each word whose reverse is also one of the words.
output = [x for x in words if x[::-1] in known_words]
print(output)
The split on string creates a list split on spaces. Then the word is included only if the reverse is in the string.
Output is:
['is', 'a', 'si']
One note, you have a variable name str. Best not to do that as str is a Python thing and could cause other issues in your code later on.
If you want word more than one letter long then you can do:
string = 'suman is a si boy'
# Split once and use a set for membership tests, instead of re-splitting
# and scanning the whole string for every word.
words = string.split()
known_words = set(words)
# Keep words longer than one letter whose reverse is also one of the words.
output = [x for x in words if len(x) > 1 and x[::-1] in known_words]
print(output)
this gives:
['is', 'si']
Final Answer...
And for the final thought, in order to get just the 'is':
# Report only the first word of each reversed pair (e.g. 'is' but not 'si').
string = 'suman is a si boy'
# Words accepted by the first two conditions so far; filled in while the
# comprehension below is running.
seen = []
# How the chained conditions work, left to right:
#   1. `x[::-1] not in seen` skips a word whose reverse was already recorded.
#   2. `not seen.append(x)` records x; append() returns None, so this is
#      always True and exists only for its side effect.
#   3. `x[::-1] in string.split()` requires the reverse to be one of the words.
#   4. `len(x) > 1` drops single-letter words.
output = [x for x in string.split() if x[::-1] not in seen and not seen.append(x) and x[::-1] in string.split() and len(x) > 1]
print(output)
output is:
['is']
BUT, this is not necessarily a good way to do it, I don't believe. Basically you are storing information in seen during the list comprehension AND referencing that same list. :)
This answer wouldn't show you 'a' and won't output 'is' with 'si'.
text = input()  # get input string (named `text` so built-in `str` is not shadowed)
words = text.split()  # list of words
y = []  # words whose reverse appears later in the input
for position, word in enumerate(words):
    # Look only at the words after the current one. A single 'a' therefore
    # never matches itself, and a pair such as 'is'/'si' is reported once,
    # for the word that comes first.
    if word[::-1] in words[position + 1:]:
        y.append(word)
print(y)  # ['is']
I'm trying to find if exists a substring in a list of strings,
FOR EXAMPLE:
I have the list of words ['GGBASDEPINK','ASDEKNIP','PINK','WORDRRAB','BAR']
PINK is a substring of ASDEKNIP , because the reverse of PINK is KNIP
and the word BAR is in WORDRRAB because the reverse is RAB
How can I find out whether the reverse of a word exists as a substring of another string in the list? If it does, the reversed word should be added to the list,
so the new list should be:
d = ['GGBASDEPINK','ASDEKNIP','PINK','WORDRRAB','BAR' ,'KNIP', 'RAB']
I tried like this
# Attempt from the question: print each word whose reverse is found in `d`.
# NOTE(review): the statements under `for` and `if` are not indented here,
# so this snippet does not run as written.
d = ['GGBASDEPINK','ASDEKNIP','PINK','WORDRRAB','BAR']
for word in d:
word = word[::-1]
# `word in d` is a list-membership test: it is true only when the reversed
# word equals a whole element of `d`. It does not search inside the longer
# strings, and no element of `d` equals one of the reversed words.
if word in d:
print(word)
But it gives nothing
Use itertools.permutations:
from itertools import permutations
d = ['GGBASDEPINK', 'ASDEKNIP', 'PINK', 'WORDRRAB', 'BAR']
# permutations() takes a snapshot of `d` up front, so appending to `d`
# inside the loop does not change which (x, y) pairs are visited.
for x, y in permutations(d, 2):
    rev = y[::-1]
    # Add the reversed word when it occurs inside another string; the
    # `not in d` check stops the same reversed word being added twice when
    # it occurs in several strings.
    if rev in x and rev not in d:
        d.append(rev)
print(d)
# ['GGBASDEPINK', 'ASDEKNIP', 'PINK', 'WORDRRAB', 'BAR', 'KNIP', 'RAB']
I am trying to get a word in a list that is followed by a word with a ''.'' in it. for example, if this is a list
test_list = ["hello", "how", "are.", "you"]
it would select the word ''you'' I have managed to pull this off but I am trying to ensure that I do not get duplicate words.
Here is what I have so far
list = []
i = 0
bool = False
words = sent.split()
for word in words:
if bool:
list.append(word)
bool = False
# the bellow if statment seems to make everything worse instead of fixing the duplicate problem
if "." in word and word not in list:
bool = True
return list
Your whole code can be reduced to this example using zip() and list comprehension:
a = ['hello', 'how', 'are.', 'you']


def get_new_list(a):
    """Return the words of *a* that directly follow a word ending in '.'.

    Args:
        a: Sequence of words.

    Returns:
        A list, in order of appearance, of every word whose predecessor ends
        with a period. Duplicates are kept.
    """
    # zip(a, a[1:]) pairs each word with its successor.
    return [
        following
        for current, following in zip(a, a[1:])
        if current.endswith('.')
    ]
Then, to remove the duplicates, if there is any, you can use set(), like this example:
final = set(get_new_list(a))
output:
{'you'}
This isn't based off of the code you posted, however it should do exactly what you're asking.
def get_word_after_dot(words):
    """Yield each word of *words* that directly follows a word ending in '.'.

    Args:
        words: Sequence of words (must support slicing and indexing).

    Yields:
        The successor of every word that ends with a period.
    """
    # The last word has no successor, so it is left out of the scan instead
    # of re-checking the bounds on every iteration.
    for index, word in enumerate(words[:-1]):
        if word.endswith('.'):
            yield words[index + 1]
Iterating over this generator will yield the words that come directly after a word ending in a period.
Here is a different approach to the same problem.
import itertools
from collections import deque
t = deque(map(lambda x: '.' in x, test_list)) # create a deque of bools
>>deque([False, False, True, False])
t.rotate(1) # shift it by one since we want the word after the '.'
>>deque([False, False, False, True])
set(itertools.compress(test_list, t)) # and then grab everywhere it is True
>>{'you'}
The itertools recipes include a definition of pairwise, which is useful for iterating over a list two consecutive items at a time:
def pairwise(iterable):
    """Return two iterators over *iterable*, the second one item ahead.

    Zipping the returned pair gives consecutive pairs:
    ``zip(*pairwise('abc'))`` produces ``('a', 'b'), ('b', 'c')``.

    Args:
        iterable: Any iterable.

    Returns:
        A tuple ``(a, b)`` of iterators, where ``b`` has already been
        advanced by one element.
    """
    # Imported here because the snippet used an `it` alias it never imported.
    from itertools import tee

    a, b = tee(iterable)
    # Advance the second iterator; the None default avoids StopIteration
    # when the iterable is empty.
    next(b, None)
    return a, b
You can use this to create a list of words that follow a word ending in '.':
words = [n for m, n in zip(*pairwise(l)) if m[-1] == '.']
Remove duplicates:
seen = set()
results = [x for x in words if not (x in seen or seen.add(x))]
I am writing a python script where I have multiple strings.
For example:
x = "brownasdfoersjumps"
y = "foxsxzxasis12sa[[#brown"
z = "thissasbrownxc-34a#s;"
In all these three strings, they have one sub string in common which is brown. I want to search it in a way that I want to create a dictionary as:
dict = {[commonly occuring substring] =>
[total number of occurrences in the strings provided]}
What would be the best way of doing that? Considering that I will have more than 200 strings each time, what would be an easy/efficient way of doing it?
This is a relatively optimised naïve algorithm. You first transform each sequence into a set of all its ngrams. Then you intersect all sets and find the longest ngram in the intersection.
from functools import partial, reduce
from itertools import chain
from typing import Iterator
def ngram(seq: str, n: int) -> Iterator[str]:
    """Return a generator over all contiguous substrings of *seq* of length *n*.

    Nothing is yielded when *n* is larger than ``len(seq)``.
    """
    return (seq[i: i + n] for i in range(len(seq) - n + 1))


def allngram(seq: str) -> set:
    """Return the set of all non-empty contiguous substrings of *seq*."""
    # Lengths run from 1 up to and including len(seq): length 0 would only
    # add empty strings, and stopping before len(seq) would drop the whole
    # string, which may itself be the longest common substring.
    lengths = range(1, len(seq) + 1)
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))
sequences = ["brownasdfoersjumps",
             "foxsxzxasis12sa[[#brown",
             "thissasbrownxc-34a#s;"]
# One set of n-grams per sequence.
seqs_ngrams = map(allngram, sequences)
# N-grams that occur in every sequence.
intersection = reduce(set.intersection, seqs_ngrams)
# `default` keeps max() from raising ValueError when nothing is common.
longest = max(intersection, key=len, default='')  # -> brown
While this might get you through short sequences, this algorithm is extremely inefficient on long sequences. If your sequences are long, you can add a heuristic to limit the largest possible ngram length (i.e. the longest possible common substring). One obvious value for such a heuristic may be the shortest sequence's length.
def allngram(seq: str, minn=1, maxn=None) -> set:
    """Return the set of contiguous substrings of *seq* with bounded length.

    Args:
        seq: The sequence to slice.
        minn: Smallest n-gram length to include.
        maxn: Largest n-gram length to include (inclusive). ``None`` means
            no limit other than ``len(seq)``.

    Returns:
        A set with every substring whose length lies in ``[minn, maxn]``.
    """
    # Both bounds are inclusive so that an n-gram as long as the shortest
    # sequence (the natural heuristic for maxn) is still considered.
    longest = len(seq) if maxn is None else min(maxn, len(seq))
    found = set()
    for n in range(minn, longest + 1):
        # Slide a window of width n across the sequence.
        for start in range(len(seq) - n + 1):
            found.add(seq[start:start + n])
    return found
sequences = ["brownasdfoersjumps",
             "foxsxzxasis12sa[[#brown",
             "thissasbrownxc-34a#s;"]
# A common substring cannot be longer than the shortest sequence.
maxn = min(map(len, sequences))
seqs_ngrams = map(partial(allngram, maxn=maxn), sequences)
# N-grams that occur in every sequence.
intersection = reduce(set.intersection, seqs_ngrams)
# `default` keeps max() from raising ValueError when nothing is common.
longest = max(intersection, key=len, default='')  # -> brown
This may still take too long (or make your machine run out of RAM), so you might want to read about some optimal algorithms (see the link I left in my comment to your question).
Update
To count the number of strings wherein each ngram occurs
from collections import Counter
sequences = ["brownasdfoersjumps",
"foxsxzxasis12sa[[#brown",
"thissasbrownxc-34a#s;"]
seqs_ngrams = map(allngram, sequences)
counts = Counter(chain.from_iterable(seqs_ngrams))
Counter is a subclass of dict, so its instances have similar interfaces:
print(counts)
Counter({'#': 1,
'#b': 1,
'#br': 1,
'#bro': 1,
'#brow': 1,
'#brown': 1,
'-': 1,
'-3': 1,
'-34': 1,
'-34a': 1,
'-34a#': 1,
'-34a#s': 1,
'-34a#s;': 1,
...
You can filter the counts to leave substrings occurring in at least n strings: {string: count for string, count in counts.items() if count >= n}
I have used a straightforward method to get the common sub sequences from multiple strings. Although the code can be further optimised.
import itertools
def getMaxOccurrence(stringsList, key):
    """Return how many strings in *stringsList* contain *key* as a substring.

    Args:
        stringsList: Iterable of strings to search.
        key: Substring to look for.

    Returns:
        The number of strings that contain ``key`` (each string counts once,
        however many times ``key`` occurs in it).
    """
    return sum(1 for word in stringsList if key in word)
def getSubSequences(STR):
    """Return every non-empty subsequence of *STR* as a string.

    Subsequences keep the original character order but need not be
    contiguous. They are listed by increasing length and, within a length,
    in the order ``itertools.combinations`` produces them. Duplicates are
    kept when *STR* has repeated characters.

    Args:
        STR: The source string.

    Returns:
        A list of ``2 ** len(STR) - 1`` strings.
    """
    return [
        ''.join(chars)
        for length in range(1, len(STR) + 1)
        for chars in itertools.combinations(STR, length)
    ]
def getCommonSequences(S):
    """Count, for every substring of the strings in *S*, how many strings contain it.

    Args:
        S: List of strings.

    Returns:
        A dict mapping each non-empty contiguous substring of any string in
        ``S`` to the number of strings that contain it. Entries are ordered
        by count (highest first), then by length (longest first), then
        alphabetically, so the order is the same on every run.
    """
    # Only contiguous substrings can pass the `candidate in word` check, so
    # they are generated directly instead of enumerating all 2**n
    # subsequences and discarding those that never match.
    candidates = set()
    for word in S:
        for start in range(len(word)):
            for stop in range(start + 1, len(word) + 1):
                candidates.add(word[start:stop])

    # Every candidate comes from some word, so each count is at least 1.
    counts = {
        candidate: sum(1 for word in S if candidate in word)
        for candidate in candidates
    }
    ranked = sorted(
        counts.items(),
        key=lambda item: (-item[1], -len(item[0]), item[0]),
    )
    return dict(ranked)
stringsList = ['abc', 'cab', 'dfab', 'xz']
seqs = getCommonSequences(stringsList)
print(seqs)