Split an existing list based on a repeated word - python

I am trying to split a list into a new list. Here is the initial list:
initList =['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word',
'title', 'PTE427', 'how', 'are', 'you']
I want to split the list at each PTExyz item, so that the new list looks like this:
newList = ['PTE123 I am programmer', 'PTE345 based word title', 'PTE427 how are you']
How should I develop a proper algorithm for the general case, where the PTExyz item is repeated?
Thank You!

The algorithm will be something like this.
Iterate over the list while keeping a temp string that is initialized as an empty string. When a string s starts with PTE, first append temp to the result list if temp is not empty, then assign s to temp. When s does not start with PTE, add it to the end of temp (strings that come before the first PTE item are ignored). After the loop, append the last temp to the result list.
ls = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word', 'title', 'PTE427', 'how', 'are', 'you']
result = []
temp = ''  # group currently being built; stays empty until the first 'PTE...' item
for s in ls:
    if s.startswith('PTE'):
        # A new marker closes the previous group (if any) and opens a new one.
        if temp:
            result.append(temp)
        temp = s
    elif temp and s:
        # Words before the first marker are ignored, and empty strings are
        # skipped because they would leave a double space inside the group.
        temp += ' ' + s
# Flush the last group, but do not emit '' when no marker was ever seen.
if temp:
    result.append(temp)
print(result)
Edit
To match the full PTExyz pattern you can use a regular expression. In that case, add import re at the top and replace the condition s.startswith('PTE') with:
re.match(r'PTE\w{3}$', s)

I think it will work
l = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word', 'title', 'PTE427', 'how', 'are', 'you']
# Drop empty strings first so the joined text never contains a double space.
s = ' '.join(word for word in l if word)
# Splitting on the literal 'PTE' removes the marker, so it is put back on every
# piece. The piece before the first marker is empty and is filtered out.
# NOTE: this splits on 'PTE' anywhere in the text, including inside other words.
resultlist = ['PTE' + part.strip() for part in s.split('PTE') if part.strip()]
print(resultlist)

It works on a regular expression PTExyz
import re
l = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word',
     'title', 'PTE427', 'how', 'are', 'you']
pattern = re.compile(r'PTE\d{3}')
# Drop empty strings first so the joined text never contains a double space.
s = ' '.join(word for word in l if word)
# findall() and split() walk the same matches, so the n-th marker in k always
# belongs to the n-th piece after the leading one.
k = pattern.findall(s)
pieces = pattern.split(s)[1:]  # [0] is the text before the first marker
result = [marker + piece.rstrip() for marker, piece in zip(k, pieces)]
print(result)

words = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word', 'title', 'PTE427', 'how', 'are', 'you']
# enumerate() yields each marker's own position; list.index() would return the
# position of the first occurrence when the same marker appears twice.
index_list = [i for i, item in enumerate(words) if "PTE" in item]
index_list.append(len(words))
# index_list is now [0, 5, 9, 13]
# Pair every start index with the next one; empty strings are dropped so the
# joined group has single spaces only.
result = [
    ' '.join(word for word in words[start:end] if word)
    for start, end in zip(index_list, index_list[1:])
]
print(result)
Output
['PTE123 I am programmer', 'PTE345 based word title', 'PTE427 how are you']

Related

Missing keys from dictionary counting words in list

My output is incomplete. There are 3 elements which are not counted.
# A program that counts the words in a string and puts them in a dictionary as key = word and value = count
def word_in_str (S):
dict_s = {} # make an empty dict
s = S.lower() # make the string lowercase
l = s.split() # split the string into a list of words, separated by whitespace
print (l) # original list containing all words
for word in l:
counter = l.count (str(word))
print (str(word)) # for testing the code: this is the key (the word)
print (counter) # for testing the code: this is the value (the count)
dict_s[str(word)] = counter
l[:] = (value for value in l if value != str(word)) # delete the word after counting it; NOTE: this changes l in place while the for loop above is still iterating over l
print (l) # for testing the code: the list after deleting the word
print (dict_s) # main print code, but there is no ('when', 'young', 'and') in result
if __name__ == '__main__':
word_in_str ('I am tall when I am young and I am short when I am old')
the output for this code is:
['i', 'am', 'tall', 'when', 'i', 'am', 'young', 'and', 'i', 'am', 'short', 'when', 'i', 'am', 'old']
i
4
['am', 'tall', 'when', 'am', 'young', 'and', 'am', 'short', 'when', 'am', 'old']
tall
1
['am', 'when', 'am', 'young', 'and', 'am', 'short', 'when', 'am', 'old']
am
4
['when', 'young', 'and', 'short', 'when', 'old']
short
1
['when', 'young', 'and', 'when', 'old']
old
1
['when', 'young', 'and', 'when'] <==what happened to this words?
{'i': 4, 'tall': 1, 'am': 4, 'short': 1, 'old': 1} <==result without the words above
I think you're overthinking the problem. A Counter already counts the elements of an iterable, and it is a type of dict.
from collections import Counter
def word_in_str(S):
    """Return a dict mapping each lowercased word of S to how often it occurs.

    Words are the whitespace-separated pieces of S. An empty string gives {}.
    """
    # Lowercase before counting so that 'I' and 'i' are the same word.
    return dict(Counter(S.lower().split()))
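The problem with your code is that your for loop iterates over l while the loop body deletes items from that same list through l[:] = ...; the loop's position keeps advancing over the shrinking list, so some words are skipped. You don't need to delete anything. Just count and store the dict entry.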
def word_in_str (S):
    """Count the lowercased, whitespace-separated words of S.

    Prints the resulting dict (word -> count) and also returns it, so the
    counts can be used by the caller as well as read on screen.
    """
    dict_s = {}
    for word in S.lower().split():
        # One pass over the words; calling list.count() for every word would
        # rescan the whole list each time.
        dict_s[word] = dict_s.get(word, 0) + 1
    print (dict_s)
    return dict_s
word_in_str ('I am tall when I am young and I am short when I am old')

python how to record the number of elements in listA that appear in the same order as in listB

I am trying to measure to what extent listA, listB, listC, ... are similar to the original list. How do I print the number of elements that occur in the same sequence in listA as they occur in the original list?
original_list = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
# 'the' and 'in' need a comma between them; adjacent string literals are
# concatenated into the single item 'thein'.
listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my','dog']
Output:
listA: Count = 2 #'my', 'dog'
listB: Count = 5 #'I', 'live', 'in', 'space', 'with'
listC: Count = 2,4,2 #'I', 'live'
#'in', 'space', 'with', 'my'
#'my', 'dog'
I wrote a function that does the job I think. It might be a bit too complex, but I can't see an easier way at the moment:
original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
# Collect the contiguous slices (two or more items) of comparative_list that
# also occur as contiguous slices of original_list, print their lengths and
# the slices themselves, and return the list of lengths.
def get_sequence_lengths(original_list, comparative_list):
# Every contiguous slice of original_list that holds at least two items.
original_options = []
for i in range(len(original_list)):
for j in range(i + 1, len(original_list)):
original_options.append(original_list[i:j + 1])
# The same kind of slices, taken from comparative_list.
comparative_options = []
for i in range(len(comparative_list)):
for j in range(i+1, len(comparative_list)):
comparative_options.append(comparative_list[i:j+1])
# Longest candidates first.
comparative_options.sort(key=len, reverse=True)
matches = []
while comparative_options:
for option in comparative_options:
if option in original_options:
matches.append(option)
# NOTE(review): the loop below appears to discard the candidates whose items
# are covered by the match just recorded, so that shorter slices of the same
# run are not reported again. Confirm against the original indentation,
# which is not preserved here.
new_comparative_options = comparative_options.copy()
for l in comparative_options:
counter = 0
for v in option:
counter = counter + 1 if v in l else 0
if counter == len(l):
new_comparative_options.remove(l)
break
comparative_options = new_comparative_options
break
# NOTE(review): presumably the exit taken when the last candidate has been
# reached; verify which loop this break belongs to.
if option == comparative_options[-1]:
break
# Re-order the recorded matches so they follow the order of original_options.
matches = [option for option in original_options if option in matches]
lengths = [len(option) for option in matches]
print(lengths)
print(matches)
return lengths
If you call it with the original list and example lists, it prints the following.
get_sequence_lengths(original, listA) prints [2] [['my', 'dog']].
get_sequence_lengths(original, listB) prints [5] [['I', 'live', 'in', 'space', 'with']].
get_sequence_lengths(original, listC) prints [2, 4, 2] [['I', 'live'], ['in', 'space', 'with', 'my'], ['my', 'dog']].
EDITED
I found this problem fun to do and wanted to explore some other options from the accepted one.
def _get_sequences(inter_dict : dict, list_range : int) -> tuple[set, int]:
occuring = [0] * list_range
for key, indices in inter_dict.items(): # lays out intersecting strings as they occur
for idx in indices:
occuring[idx] = key
_temp_list = []
lengths = []
matches = []
for idx in range(len(occuring)):
item = occuring.pop(0)
if item != 0: # if on python 3.8+ you could use (( item := occuring.pop(0) ) != 0) instead
_temp_list.append(item)
elif (bool(_temp_list) and len(_temp_list) > 1):
matches.append( _temp_list.copy() )
lengths.append( len(_temp_list) )
_temp_list.clear()
elif (bool(_temp_list) and item == 0) and len(_temp_list) == 1: # if its a single occurrence ignore
_temp_list.clear()
if bool(_temp_list) and len(_temp_list) > 1: # ensures no matching strings are missed
matches.append( _temp_list )
lengths.append( len(_temp_list) )
return lengths, matches
def get_intersecting(list_a, list_b) -> tuple[set, int]:
intersecting = set(list_a) & set(list_b) # returns intersecting strings
indices_dict = {}
for item in intersecting:
indices = [ index for index, value in enumerate(list_b) if value == item ] # gets occuring indices of each string
indices_dict[item] = indices
return _get_sequences( indices_dict, len(list_b) )
if __name__ == "__main__":
    original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
    listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
    listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
    listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
    # Expected output, one line per compared list:
    #   [2] [['my', 'dog']]
    #   [5] [['I', 'live', 'in', 'space', 'with']]
    #   [2, 4, 2] [['I', 'live'], ['in', 'space', 'with', 'my'], ['my', 'dog']]
    for candidate in (listA, listB, listC):
        lengths, matches = get_intersecting(original, candidate)
        print(lengths, matches)
EDITED x2
This would probably be my final solution.
from collections.abc import Iterator


def ordered_intersecting(list_a, list_b) -> Iterator[tuple[int, list]]:
    """Yield (length, run) for every run of list_b made of items of list_a.

    A run is two or more adjacent items of list_b that all occur in list_a.
    NOTE: only membership in list_a is checked, not the position of the items
    within list_a.
    """
    matches = []
    for item in list_b:
        if item in list_a:  # collect shared items as they appear
            matches.append(item)
            continue
        # A non-shared item ends the current run. Report it only when it holds
        # more than one item, and always start over, so that a lone match does
        # not leak into the next run.
        if len(matches) > 1:
            yield len(matches), matches
        matches = []
    if len(matches) > 1:  # catch a run that reaches the end of list_b
        yield len(matches), matches
if __name__ == "__main__":
    original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
    listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
    listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
    listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
    # Turn each generator into a list so the (length, run) pairs are printed.
    for candidate in (listA, listB, listC):
        print( list(ordered_intersecting(original, candidate)) )

How to separate a list into two list at '\n'?

I would like to separate a list in different lists at '\n'. For example, if I have a list like this one:
l = ['hi', 'my', 'name', 'is', 'john', '\n', '\n', 'nice', 'to', 'meet', 'you']
I'd like to separate the items this way:
l = [['hi', 'my', 'name', 'is', 'john'], ['nice', 'to', 'meet', 'you']]
Can someone help me?
Some code that I tried to write:
l = ['hi', 'my', 'name', 'is', 'john', '\n', '\n', 'nice', 'to', 'meet', 'you']
lst = []
ls = []
for word in l:
if word != '\n':
# NOTE: this appends the whole list l, not the current word.
ls.append(l)
else:
# NOTE: ls is never replaced by a new list, so every append here adds the
# same list object again.
lst.append(ls)
# NOTE: nothing is appended to lst after the loop, so the words that follow
# the last '\n' never reach lst as a group of their own.
print(lst)
I think you just wanted to append word to the list ls. Also, clear the partial list at the newlines like so:
lst = []
ls = []
for word in l:
    if word == '\n':
        # A newline closes the current group; empty groups are not stored.
        if ls:
            lst.append(ls)
        ls = []
    else:
        ls.append(word)
# Store the group that was still open when the input ended.
if ls:
    lst.append(ls)
print(lst)
resulting in
[['hi', 'my', 'name', 'is', 'john'], ['nice', 'to', 'meet', 'you']]
You could use itertools.groupby:
>>> from itertools import groupby
>>> l = ['hi', 'my', 'name', 'is', 'john', '\n', '\n', 'nice', 'to', 'meet', 'you']
>>> l = [list(group) for key, group in groupby(l, lambda s: s != '\n') if key]
>>> l
[['hi', 'my', 'name', 'is', 'john'], ['nice', 'to', 'meet', 'you']]

Modifying N+1th element in an array at Nth iteration?

I am currently doing a data analysis project involving text mining. As of now, I am stuck on filtering out certain phrases.
Suppose I have this tokenized array of words
arr = ['hello' ',' , 'how', 'is' , 'your', 'day', 'going', '?' , '#', 'HelloWorld']
(hello, how is your day going? #HelloWorld)
and I want to remove the #HelloWorld from the sentence.
My original logic was to traverse the array and check for the #. Once the # has been found, I would replace the # and the element after the # with a blank space, as follows:
N = 0
for index to arr:
if arr[N] == '#':
arr[N] = (' ')
arr[N+1] = (' ')
N += 1
unfortunately, I got the error list assignment index out of range at line 5. I tried to use the .append() but it only allows modification at N .
Is there another approach to this?
This should work, like the others said, you need to check when you are at the end of the list.
EDIT: simplify !
def blank_hashtags(tokens):
    """Replace every '#' token and the token after it with ' ', in place.

    The positions of the '#' tokens are collected before anything is changed.
    A '#' at the very end of the list has no following token, so only the '#'
    itself is replaced. Returns the same list object for convenience.
    """
    indices = [idx for idx, elt in enumerate(tokens) if elt == '#']
    for idx in indices:
        tokens[idx] = ' '
        if idx + 1 < len(tokens):  # check that we are not at the end of the list
            tokens[idx + 1] = ' '
    return tokens


arr = ['a', 'b', '#', 'aa']
blank_hashtags(arr)
Your code will try to access outside the array when the last element is #, so you need to check for that.
There's also no need to use a separate variable for iteration and indexing, just iterate over the range of indexes.
for i in range(len(arr)):
    if arr[i] == '#':
        arr[i] = ' '
        # i + 1 is a valid index everywhere except at the last position, so
        # the token after a '#' in the second-to-last slot is blanked as well.
        if i + 1 < len(arr):
            arr[i+1] = ' '
The root cause of the error in your code is that N+1 goes out of range when the loop reaches the end of the list.
If an element always follows each '#', try the code below:
arr = ['hello' ',' , 'how', 'is' , 'your', 'day', 'going', '?' , '#', 'HelloWorld']
# The index range is fixed from the list's length before the loop starts.
for index in range(len(arr)):
    if arr[index] != '#':
        continue
    # Overwrite the '#' and the item after it with empty strings.
    arr[index:index+2] = ['', '']
print (arr)
Output:
['hello,', 'how', 'is', 'your', 'day', 'going', '?', '', '']
[Finished in 0.133s]
If the array ends with '#', the '#' is still replaced with ['', ''], so the list grows by one element (I am not sure whether this result is what you expect).
arr = ['hello' ',' , 'how', 'is' , 'your', 'day', 'going', '?' , '#', 'HelloWorld', '#']
# The index range is fixed from the list's length before the loop starts.
for index in range(len(arr)):
    if arr[index] != '#':
        continue
    # For the trailing '#' the slice covers one item only, so assigning two
    # empty strings makes the list one item longer.
    arr[index:index+2] = ['', '']
print (arr)
Output:
['hello,', 'how', 'is', 'your', 'day', 'going', '?', '', '', '', '']
[Finished in 0.179s]

how to turn string of words into list

i have turned a list of words into a string
now i want to turn them back into a list but i dont know how, please help
temp = ['hello', 'how', 'is', 'your', 'day']
temp_string = str(temp)
temp_string will then be "['hello', 'how', 'is', 'your', 'day']"
i want to turn this back into a list now but when i do list(temp_string), this will happen
['[', "'", 'h', 'e', 'l', 'l', 'o', "'", ',', ' ', "'", 'h', 'o', 'w', "'", ',', ' ', "'", 'i', 's', "'", ',', ' ', "'", 'y', 'o', 'u', 'r', "'", ',', ' ', "'", 'd', 'a', 'y', "'", ']']
Please help
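You can do this easily by evaluating the string. That's not something I'd normally suggest but, assuming you control the input, it's quite safe. For input you do not control, use ast.literal_eval instead of eval, because it only accepts Python literals: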
>>> temp = ['hello', 'how', 'is', 'your', 'day'] ; type(temp) ; temp
<class 'list'>
['hello', 'how', 'is', 'your', 'day']
>>> tempstr = str(temp) ; type(tempstr) ; tempstr
<class 'str'>
"['hello', 'how', 'is', 'your', 'day']"
>>> temp2 = eval(tempstr) ; type(temp2) ; temp2
<class 'list'>
['hello', 'how', 'is', 'your', 'day']
Duplicate question? Converting a String to a List of Words?
Working code below (Python 3)
import re
sentence_list = ['hello', 'how', 'are', 'you']
# join() builds the sentence in one step and leaves no trailing space.
sentence = " ".join(sentence_list)
print(sentence)
#output: "hello how are you"
# Raw string, so that \w reaches the regex engine as written.
word_list = re.sub(r"[^\w]", " ", sentence).split()
print(word_list)
#output: ['hello', 'how', 'are', 'you']
You can split on commas and join them back together:
temp = ['hello', 'how', 'is', 'your', 'day']
# str() gives the text "['hello', 'how', 'is', 'your', 'day']".
temp_string = str(temp)
# split(',') cuts that text at every comma and ''.join() glues the pieces back
# together with nothing in between.
# NOTE: temp_new is therefore a str ("['hello' 'how' 'is' 'your' 'day']"), not a list.
temp_new = ''.join(temp_string.split(','))
The split() function creates a list from the string, using ',' as the delimiter. The join() function then takes that list and constructs a single string from it, so the result is a string rather than a list.

Categories

Resources