How to separate a list into two lists at '\n'? - python

I would like to separate a list into different lists at '\n'. For example, if I have a list like this one:
l = ['hi', 'my', 'name', 'is', 'john', '\n', '\n', 'nice', 'to', 'meet', 'you']
I'd like to separate the items this way:
l = [['hi', 'my', 'name', 'is', 'john'], ['nice', 'to', 'meet', 'you']]
Can someone help me?
Some code that I tried to write:
l = ['hi', 'my', 'name', 'is', 'john', '\n', '\n', 'nice', 'to', 'meet', 'you']
lst = []
ls = []
for word in l:
    if word != '\n':
        ls.append(l)
    else:
        lst.append(ls)
print(lst)

I think you just wanted to append word to the list ls. Also, clear the partial list at the newlines like so:
lst = []
ls = []
for word in l:
    if word != '\n':
        ls.append(word)
    else:
        if len(ls) > 0:
            lst.append(ls)
        ls = []
if len(ls) > 0:
    lst.append(ls)
print(lst)
resulting in
[['hi', 'my', 'name', 'is', 'john'], ['nice', 'to', 'meet', 'you']]

You could use itertools.groupby:
>>> from itertools import groupby
>>> l = ['hi', 'my', 'name', 'is', 'john', '\n', '\n', 'nice', 'to', 'meet', 'you']
>>> l = [list(group) for key, group in groupby(l, lambda s: s != '\n') if key]
>>> l
[['hi', 'my', 'name', 'is', 'john'], ['nice', 'to', 'meet', 'you']]
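To see what happens before the if key filter, you can look at the raw groups groupby produces on the original list (shown here for illustration):
>>> l = ['hi', 'my', 'name', 'is', 'john', '\n', '\n', 'nice', 'to', 'meet', 'you']
>>> [(key, list(group)) for key, group in groupby(l, lambda s: s != '\n')]
[(True, ['hi', 'my', 'name', 'is', 'john']), (False, ['\n', '\n']), (True, ['nice', 'to', 'meet', 'you'])]
groupby collects consecutive items that share the same key, so each run of words becomes a True group and each run of '\n' a False group; the if key drops the latter.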

Related

python how to record the number of elements in listA that appear in the same order as in listB

I am trying to trace to what extent listA, listB, listC... are similar to the original list. How do I print the number of elements that occur in listA in the same sequence as they occur in the original list?
original_list = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
Output:
listA: Count = 2 #'my', 'dog'
listB: Count = 5 #'I', 'live', 'in', 'space', 'with'
listC: Count = 2,4,2 #'I', 'live'
#'in', 'space', 'with', 'my'
#'my', 'dog'
I wrote a function that does the job I think. It might be a bit too complex, but I can't see an easier way at the moment:
original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
def get_sequence_lengths(original_list, comparative_list):
    original_options = []
    for i in range(len(original_list)):
        for j in range(i + 1, len(original_list)):
            original_options.append(original_list[i:j + 1])
    comparative_options = []
    for i in range(len(comparative_list)):
        for j in range(i + 1, len(comparative_list)):
            comparative_options.append(comparative_list[i:j + 1])
    comparative_options.sort(key=len, reverse=True)

    matches = []
    while comparative_options:
        for option in comparative_options:
            if option in original_options:
                matches.append(option)
                new_comparative_options = comparative_options.copy()
                for l in comparative_options:
                    counter = 0
                    for v in option:
                        counter = counter + 1 if v in l else 0
                        if counter == len(l):
                            new_comparative_options.remove(l)
                            break
                comparative_options = new_comparative_options
                break
        if option == comparative_options[-1]:
            break

    matches = [option for option in original_options if option in matches]
    lengths = [len(option) for option in matches]
    print(lengths)
    print(matches)
    return lengths
If you call it with the original list and example lists, it prints the following.
get_sequence_lengths(original, listA) prints [2] [['my', 'dog']].
get_sequence_lengths(original, listB) prints [5] [['I', 'live', 'in', 'space', 'with']].
get_sequence_lengths(original, listC) prints [2, 4, 2] [['I', 'live'], ['in', 'space', 'with', 'my'], ['my', 'dog']].
EDITED
I found this problem fun and wanted to explore some options other than the accepted one.
def _get_sequences(inter_dict: dict, list_range: int) -> tuple[list, list]:
    occuring = [0] * list_range
    for key, indices in inter_dict.items():  # lays out intersecting strings as they occur
        for idx in indices:
            occuring[idx] = key
    _temp_list = []
    lengths = []
    matches = []
    for idx in range(len(occuring)):
        item = occuring.pop(0)
        if item != 0:  # if on python 3.8+ you could use (( item := occuring.pop(0) ) != 0) instead
            _temp_list.append(item)
        elif bool(_temp_list) and len(_temp_list) > 1:
            matches.append(_temp_list.copy())
            lengths.append(len(_temp_list))
            _temp_list.clear()
        elif (bool(_temp_list) and item == 0) and len(_temp_list) == 1:  # if it's a single occurrence, ignore it
            _temp_list.clear()
    if bool(_temp_list) and len(_temp_list) > 1:  # ensures no matching strings are missed
        matches.append(_temp_list)
        lengths.append(len(_temp_list))
    return lengths, matches

def get_intersecting(list_a, list_b) -> tuple[list, list]:
    intersecting = set(list_a) & set(list_b)  # returns intersecting strings
    indices_dict = {}
    for item in intersecting:
        indices = [index for index, value in enumerate(list_b) if value == item]  # gets occurring indices of each string
        indices_dict[item] = indices
    return _get_sequences(indices_dict, len(list_b))
if __name__ == "__main__":
    original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
    listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
    listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
    listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']

    lengths, matches = get_intersecting(original, listA)
    print(lengths, matches)  # [2] [['my', 'dog']]
    lengths, matches = get_intersecting(original, listB)
    print(lengths, matches)  # [5] [['I', 'live', 'in', 'space', 'with']]
    lengths, matches = get_intersecting(original, listC)
    print(lengths, matches)  # [2, 4, 2] [['I', 'live'], ['in', 'space', 'with', 'my'], ['my', 'dog']]
EDITED x2
This would probably be my final solution.
def ordered_intersecting(list_a, list_b):
    matches = []
    for item in list_b:
        if item in list_a:  # while iterating we can just collect intersecting items as they appear
            matches.append(item)
        else:
            if len(matches) > 1:  # once we hit a non-intersecting item we can yield the run (as long as it is longer than 1)
                yield len(matches), matches.copy()  # a shallow copy should be good enough, but it can be changed to a deep one if needed
            matches.clear()  # reset the run whether or not it was long enough to yield
    if len(matches) > 1:  # catch any remaining matches at the end of list_b
        yield len(matches), matches
if __name__ == "__main__":
    original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
    listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
    listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
    listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']

    print( list(ordered_intersecting(original, listA)) )
    print( list(ordered_intersecting(original, listB)) )
    print( list(ordered_intersecting(original, listC)) )

I want to split elements within a nested list by comma

Below is the list.
List=[['hello', 'how'], ['are', 'you', 'hope'], ['you,are,fine', 'thank', 'you']]
I want the output list as
List=[['hello', 'how'], ['are', 'you', 'hope'], ['you', 'are' ,'fine', 'thank', 'you']]
Try the following nested comprehension, which rebuilds the list in a single pass while splitting the tokens:
>>> [[token for el in sub for token in el.split(',')] for sub in List]
[['hello', 'how'], ['are', 'you', 'hope'], ['you', 'are', 'fine', 'thank', 'you']]
Using a simple iteration and str.split
Ex:
lst = [['hello', 'how'], ['are', 'you', 'hope'], ['you,are,fine', 'thank', 'you']]
result = []
for i in lst:
    temp = []
    for j in i:
        temp += j.split(",")
    result.append(temp)
print(result)
Output:
[['hello', 'how'],
['are', 'you', 'hope'],
['you', 'are', 'fine', 'thank', 'you']]
You can also use this comprehension (note the order of the for clauses):
[[z for x in y for z in x.split(',')] for y in List]
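For example, with the list from the question:
>>> List = [['hello', 'how'], ['are', 'you', 'hope'], ['you,are,fine', 'thank', 'you']]
>>> [[z for x in y for z in x.split(',')] for y in List]
[['hello', 'how'], ['are', 'you', 'hope'], ['you', 'are', 'fine', 'thank', 'you']]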

split an existing list based on a repeated word

I am trying to split a list into new lists. Here's the initial list:
initList =['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word',
'title', 'PTE427', 'how', 'are', 'you']
I want to split the list based on the repeated item PTExyz into a new list which looks like this:
newList = ['PTE123 I am programmer', 'PTE345 based word title', 'PTE427 how are you']
How should I develop a proper algorithm for the general case with the repeated item PTExyz?
Thank You!
The algorithm will be something like this.
Iterate over the list and look for a string s that starts with PTE. Assign it to a temp string, which starts out empty. Append every following string to temp until you hit another string that starts with PTE; at that point, if temp is not empty, append temp to your result list and start a new temp from the current string. After the loop, append the last temp as well.
ls = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word', 'title', 'PTE427', 'how', 'are', 'you']
result = []
temp = ''
for s in ls:
    if s.startswith('PTE'):
        if temp != '':
            result.append(temp)
        temp = s
    else:
        if temp == '':
            continue
        temp += ' ' + s
result.append(temp)
print(result)
Edit
For handling the pattern PTExyz you can use a regular expression. In that case, replace the s.startswith('PTE') check with:
re.match(r'PTE\w{3}$', s)
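For illustration, a sketch of the same loop with the regex check swapped in (assuming the markers are always PTE followed by exactly three word characters; empty strings are skipped so they don't add extra spaces):
import re

ls = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word', 'title', 'PTE427', 'how', 'are', 'you']
result = []
temp = ''
for s in ls:
    if not s:                      # skip empty strings
        continue
    if re.match(r'PTE\w{3}$', s):  # replaces s.startswith('PTE')
        if temp != '':
            result.append(temp)
        temp = s
    else:
        if temp == '':
            continue
        temp += ' ' + s
result.append(temp)
print(result)
# ['PTE123 I am programmer', 'PTE345 based word title', 'PTE427 how are you']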
I think this will work:
l = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word', 'title', 'PTE427', 'how', 'are', 'you']
resultlist = []
s = ' '.join(l)
parts = s.split('PTE')
for i in parts:
    resultlist.append('PTE' + i)
resultlist.remove('PTE')
print(resultlist)
This works using a regular expression for PTExyz:
import re

l = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word',
     'title', 'PTE427', 'how', 'are', 'you']
pattern = re.compile(r'[P][T][E]\d\d\d')
k = []
for i in l:
    if pattern.match(i) is not None:
        k.append(i)
s = ' '.join(l)
parts = re.split(pattern, s)
parts.remove('')
for i in range(len(k)):
    parts[i] = k[i] + parts[i]
print(parts)
>>> list =['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word','title', 'PTE427', 'how', 'are', 'you']
>>> index_list =[ list.index(item) for item in list if "PTE" in item]
>>> index_list.append(len(list))
>>> index_list
[0, 5, 9, 13]
>>> [' '.join(list[index_list[i-1]:index_list[i]]) for i,item in enumerate(index_list) if item > 0 ]
Output
['PTE123 I am programmer', 'PTE345 based word title', 'PTE427 how are you']

Finding duplicates in a list of a list, and adding their values

I'm trying to find the top 50 words that occur within three texts of Shakespeare and the ratio of each word's occurrence in macbeth.txt, allswell.txt, and othello.txt. Here is my code so far:
def byFreq(pair):
    return pair[1]

def shakespeare():
    counts = {}
    A = []
    for words in ['macbeth.txt', 'allswell.txt', 'othello.txt']:
        text = open(words, 'r').read()
        test = text.lower()
        for ch in '!"$%&()*+,-./:;<=>?#[\\]^_`{|}~':
            text = text.replace(ch, ' ')
        words = text.split()
        for w in words:
            counts[w] = counts.get(w, 0) + 1
        items = list(counts.items())
        items.sort()
        items.sort(key=byFreq, reverse=True)
        for i in range(50):
            word, count = items[i]
            count = count / float(len(counts))
            A += [[word, count]]
    print A
And its output:
>>> shakespeare()
[['the', 0.12929982922664066], ['and', 0.09148572822639668], ['I', 0.08075140278116613], ['of', 0.07684801171017322], ['to', 0.07562820200048792], ['a', 0.05220785557453037], ['you', 0.04415711149060746], ['in', 0.041717492071236886], ['And', 0.04147353012929983], ['my', 0.04147353012929983], ['is', 0.03927787265186631], ['not', 0.03781410100024396], ['that', 0.0358624054647475], ['it', 0.03366674798731398], ['Macb', 0.03342278604537692], ['with', 0.03269090021956575], ['his', 0.03147109050988046], ['be', 0.03025128080019517], ['The', 0.028787509148572824], ['haue', 0.028543547206635766], ['me', 0.027079775555013418], ['your', 0.02683581361307636], ['our', 0.025128080019516955], ['him', 0.021956574774335203], ['Enter', 0.019516955354964626], ['That', 0.019516955354964626], ['for', 0.01927299341302757], ['this', 0.01927299341302757], ['he', 0.018541107587216395], ['To', 0.01780922176140522], ['so', 0.017077335935594046], ['all', 0.0156135642839717], ['What', 0.015369602342034643], ['are', 0.015369602342034643], ['thou', 0.015369602342034643], ['will', 0.015125640400097584], ['Macbeth', 0.014881678458160527], ['thee', 0.014881678458160527], ['But', 0.014637716516223469], ['but', 0.014637716516223469], ['Macd', 0.014149792632349353], ['they', 0.014149792632349353], ['their', 0.013905830690412296], ['we', 0.013905830690412296], ['as', 0.01341790680653818], ['vs', 0.01341790680653818], ['King', 0.013173944864601122], ['on', 0.013173944864601122], ['yet', 0.012198097096852892], ['Rosse', 0.011954135154915833], ['the', 0.15813168261114238], ['I', 0.14279684862127182], ['and', 0.1231007315700619], ['to', 0.10875070343275182], ['of', 0.10481148002250985], ['a', 0.08581879572312887], ['you', 0.08581879572312887], ['my', 0.06992121553179516], ['in', 0.061902082160945414], ['is', 0.05852560495216657], ['not', 0.05486775464265616], ['it', 0.05472706809229038], ['that', 0.05472706809229038], ['his', 0.04727068092290377], ['your', 0.04389420371412493], ['me', 0.043753517163759144], ['be', 0.04305008441193022], ['And', 0.04037703995498031], ['with', 0.038266741699493526], ['him', 0.037703995498030385], ['for', 0.03601575689364097], ['he', 0.03404614518851998], ['The', 0.03137310073157006], ['this', 0.030810354530106922], ['her', 0.029262802476083285], ['will', 0.0291221159257175], ['so', 0.027011817670230726], ['have', 0.02687113111986494], ['our', 0.02687113111986494], ['but', 0.024760832864378166], ['That', 0.02293190770962296], ['PAROLLES', 0.022791221159257174], ['To', 0.021384355655599326], ['all', 0.021384355655599326], ['shall', 0.021102982554867755], ['are', 0.02096229600450197], ['as', 0.02096229600450197], ['thou', 0.02039954980303883], ['Macb', 0.019274057400112548], ['thee', 0.019274057400112548], ['no', 0.01871131119864941], ['But', 0.01842993809791784], ['Enter', 0.01814856499718627], ['BERTRAM', 0.01758581879572313], ['HELENA', 0.01730444569499156], ['we', 0.01730444569499156], ['do', 0.017163759144625774], ['thy', 0.017163759144625774], ['was', 0.01674169949352842], ['haue', 0.016460326392796848], ['I', 0.19463784682531435], ['the', 0.17894627455055595], ['and', 0.1472513769094877], ['to', 0.12989712147978802], ['of', 0.12002494024732412], ['you', 0.1079704873739998], ['a', 0.10339810869791126], ['my', 0.0909279850358516], ['in', 0.07627558973293151], ['not', 0.07159929335965914], ['is', 0.0697287748103502], ['it', 0.0676504208666736], ['that', 0.06733866777512211], ['me', 0.06099968824690845], ['your', 0.0543489556271433], ['And', 0.053205860958121166], ['be', 0.05310194326093734], 
['his', 0.05154317780317988], ['with', 0.04769822300737816], ['him', 0.04665904603553985], ['her', 0.04364543281720877], ['for', 0.04322976202847345], ['he', 0.042190585056635144], ['this', 0.04187883196508366], ['will', 0.035332017042502335], ['Iago', 0.03522809934531851], ['so', 0.03356541619037722], ['The', 0.03325366309882573], ['haue', 0.031902733035435935], ['do', 0.03138314454951678], ['but', 0.030240049880494647], ['That', 0.02857736672555336], ['thou', 0.027642107450898887], ['as', 0.027434272056531227], ['To', 0.026810765873428243], ['our', 0.02504416502130313], ['are', 0.024628494232567806], ['But', 0.024420658838200146], ['all', 0.024316741141016316], ['What', 0.024212823443832486], ['shall', 0.024004988049464823], ['on', 0.02265405798607503], ['thee', 0.022134469500155875], ['Enter', 0.021822716408604385], ['thy', 0.021199210225501402], ['no', 0.020783539436766082], ['she', 0.02026395095084693], ['am', 0.02005611555647927], ['by', 0.019848280162111608], ['have', 0.019848280162111608]]
Instead of outputting the top 50 words across all three texts, it outputs the top 50 words of each text, 150 words in total. I'm struggling with removing the duplicates while adding their ratios together. For example, in macbeth.txt the word 'the' has a ratio of 0.12929982922664066, in allswell.txt a ratio of 0.15813168261114238, and in othello.txt a ratio of 0.17894627455055595; I want to combine the ratios of all three. I'm pretty sure I have to use a for loop, but I'm struggling to loop through a list within a list. I am more of a Java guy, so any help would be appreciated!
You can use a list comprehension and the Counter class:
from collections import Counter
c = Counter([word for file in ['macbeth.txt', 'allswell.txt', 'othello.txt']
             for word in open(file).read().split()])
Then you get a dict which maps words to their counts. You can sort them like this:
sorted([(i,v) for v,i in c.items()])
If you want the relative quantities, then you can calculate the total number of words:
numWords = sum([i for (v,i) in c.items()])
and adapt the dict c via a dict-comprehension:
c = { v:(i/numWords) for (v,i) in c.items()}
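Putting those pieces together, a rough sketch of getting the combined top 50 might look like this (assuming the three .txt files from the question are readable; lower-casing as the question intends, punctuation stripping left out):
from collections import Counter

files = ['macbeth.txt', 'allswell.txt', 'othello.txt']
c = Counter([word for file in files
             for word in open(file).read().lower().split()])

numWords = sum(c.values())  # total number of words across all three texts
ratios = {word: count / numWords for word, count in c.items()}

# sort by combined ratio, largest first, and keep the top 50
top50 = sorted(ratios.items(), key=lambda pair: pair[1], reverse=True)[:50]
print(top50)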
You're summarizing the count inside your loop over files. Move the summary code outside your for loop.
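In other words, keep counts accumulating across all three files and only sort and take the top 50 once, after the loop; a minimal sketch of that restructuring, keeping the question's variable names, might look like:
def byFreq(pair):
    return pair[1]

def shakespeare():
    counts = {}
    A = []
    for fname in ['macbeth.txt', 'allswell.txt', 'othello.txt']:
        text = open(fname, 'r').read().lower()
        for ch in '!"$%&()*+,-./:;<=>?#[\\]^_`{|}~':
            text = text.replace(ch, ' ')
        for w in text.split():
            counts[w] = counts.get(w, 0) + 1
    # summary moved outside the file loop: it now runs once over the combined counts
    items = list(counts.items())
    items.sort()
    items.sort(key=byFreq, reverse=True)
    for i in range(50):
        word, count = items[i]
        A += [[word, count / float(len(counts))]]
    print(A)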

Splitting a list using indices

I'm struggling to cut a list into pieces at certain indices. Although I'm able to do it one piece at a time, I haven't arrived at an expression that will allow me to skip doing it piecewise.
import re
# Creating list to split
list = ['Leading', 'text', 'of', 'no', 'interest', '1.', 'Here', 'begins', 'section', '1', '2.', 'This', 'is', 'section', '2', '3.', 'Now', 'we', 'have', 'section', '3']
# Identifying where sections begin and end
section_ids = [i for i, item in enumerate(list) if re.search('[0-9]+\.(?![0-9])', item)]
# Simple creation of a new list for each section, piece by piece
section1 = list[section_ids[0]:section_ids[1]]
section2 = list[section_ids[1]:section_ids[2]]
section3 = list[section_ids[2]:]
# Iterative creation of a new list for each claim - DOES NOT WORK
for i in range(len(section_ids)):
    if i < max(range(len(section_ids))):
        section[i] = list[section_ids[i] : section_ids[i + 1]]
    else:
        section[i] = list[section_ids[i] : ]
    print section[i]
# This is what I'd like to get
# ['1.', 'Here', 'begins', 'section', '1']
# ['2.', 'This', 'is', 'section', '2']
# ['3.', 'Now', 'we', 'have', 'section', '3']
for i, j in map(None, section_ids, section_ids[1:]):
    print my_list[i:j]
The itertools version will be more efficient if section_ids is large:
from itertools import izip_longest, islice

for i, j in izip_longest(section_ids, islice(section_ids, 1, None)):
    print my_list[i:j]
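On Python 3, where map(None, ...) and izip_longest are gone, a roughly equivalent sketch uses itertools.zip_longest:
from itertools import zip_longest

for i, j in zip_longest(section_ids, section_ids[1:]):
    print(my_list[i:j])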
I was able to produce the desired output with the following code:
section = []
for i, v in enumerate(section_ids + [len(list)]):
    if i == 0:
        continue
    section.append(list[section_ids[i-1]:v])
Are you trying to achieve something like this?
>>> section = []  # list to hold the sublists
>>> for index, location in enumerate(section_ids):
...     if location != section_ids[-1]:  # assume it's not the last one
...         section.append(list[location:section_ids[index + 1]])
...     else:
...         section.append(list[location:])
...     print section[-1]
...
['1.', 'Here', 'begins', 'section', '1']
['2.', 'This', 'is', 'section', '2']
['3.', 'Now', 'we', 'have', 'section', '3']
>>>
or:
>>> import re
>>> from pprint import pprint
>>> values = ['Leading', 'text', 'of', 'no', 'interest', '1.', 'Here', 'begins', 'section', '1', '2.', 'This', 'is', 'section', '2', '3.', 'Now', 'we', 'have', 'section', '3']
>>> section_ids = [i for i, item in enumerate(values) if re.search('[0-9]+\.(?![0-9])', item)] + [len(values)]
>>> section = [values[location:section_ids[index + 1]] for index, location in enumerate(section_ids) if location != section_ids[-1]]
>>> pprint(section)
[['1.', 'Here', 'begins', 'section', '1'],
['2.', 'This', 'is', 'section', '2'],
['3.', 'Now', 'we', 'have', 'section', '3']]
