data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
pref_network_find = re.findall('(\S+\s+255.255.255.\w+)',str(data))
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
i = i.replace(word,str(mydict[word]))
pref = print (i)
listi = []
for line in pref_network_find:
listi.append(i)
print (listi)
10.185.16.64 27
55.242.33.0 24
55.242.154.0 30
['55.242.154.0 30', '55.242.154.0 30', '55.242.154.0 30']
Process finished with exit code 0
I'm trying to get ['55.242.154.0 30', '55.242.33.0 24', '10.185.16.64 27'] as listi at the end, but I can't understand my mistake here. Could you help me with that?
You do not need to garner the initial spliced and joined IPs with regex; instead, just use str.split():
import re

# Sample "network <address> <netmask>" statements.
data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
# Dotted-quad netmask -> prefix length.
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}

# Build "<address> <prefix>" strings, then sort them numerically (descending)
# by every number they contain. The key must be a list: a bare map object is
# not orderable in Python 3 and would raise TypeError inside sorted().
final_list = sorted(
    ['{} {}'.format(address, mydict[mask]) for _, address, mask in [row.split() for row in data]],
    key=lambda entry: list(map(int, re.split(r'\.|\s', entry))),
    reverse=True,
)
Output:
['55.242.154.0 30', '55.242.33.0 24', '10.185.16.64 27']
The list ends up containing the last entry (the one ending in 30) three times because of this code:
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
i = i.replace(word,str(mydict[word]))
pref = print (i)
After this loop finishes, i still holds the last processed entry, '55.242.154.0 30'. You then reuse that stale variable i like this:
for line in pref_network_find:
listi.append(i)
So the code is doing exactly what it was told: i is '55.242.154.0 30', and that same value is appended to your result on every iteration.
Correct code goes like this.
import re

# Sample "network <address> <netmask>" statements.
data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
# Raw string with escaped dots, so "." only matches a literal dot and the
# pattern contains no invalid string escapes.
pref_network_find = re.findall(r'(\S+\s+255\.255\.255\.\w+)', str(data))
# Dotted-quad netmask -> prefix length.
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
listi = []
for i in pref_network_find:
    # Swap the netmask token for its prefix length.
    for word in i.split():
        if word in mydict:
            i = i.replace(word, str(mydict[word]))
    print(i)
    # Append inside the loop, while i still refers to the current entry.
    listi.append(i)
print(listi)
Correct me if I am wrong here, maybe you want something else, however, this is what I understood by your question.
Your code is wrong because you are appending the wrong variable, i, here:
for line in pref_network_find:
listi.append(i)
After the previous loop has finished, i still holds its last value, '55.242.154.0 30'. You should use the loop variable of the current loop rather than the stale i, or append inside the first for loop directly:
import re  # the snippet calls re.findall, so it needs this import to run

# Sample "network <address> <netmask>" statements.
data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
# Raw string with escaped dots so "." only matches a literal dot.
pref_network_find = re.findall(r'(\S+\s+255\.255\.255\.\w+)', str(data))
# Dotted-quad netmask -> prefix length.
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
listi = []
for i in pref_network_find:
    for word in i.split():
        if word in mydict:
            # Store the converted entry immediately instead of reusing the
            # loop variable after the loop has finished.
            listi.append(i.replace(word, str(mydict[word])))
print(listi)
Related
I have many text documents that I want to compare to one another so that I can remove all text that is exactly the same between them. The goal is to find boilerplate text that is consistent across documents so it can be removed before NLP.
The best way I figured to do this is to find Longest Common Sub-strings that exist or are mostly present in all the documents. However, doing this has been incredibly slow.
Here is an example of what I am trying to accomplish:
DocA:
Title: To Kill a Mocking Bird
Author: Harper Lee
Published: July 11, 1960
DocB:
Title: 1984
Author: George Orwell
Published: June 1949
DocC:
Title: The Great Gatsby
Author: F. Scott Fitzgerald
The output would show something like:
{
'Title': 3,
'Author': 3,
'Published': 2,
}
The results would then be used to strip out the commonalities between documents.
Here is some code I have tested in Python. It's incredibly slow with any significant number of permutations:
file_perms = list(itertools.permutations(files, 2))
results = {}
for p in file_perms:
doc_a = p[0]
doc_b = p[1]
while True:
seq_match = SequenceMatcher(a=doc_a, b=doc_b)
match = seq_match.find_longest_match(0, len(doc_a), 0, len(doc_b))
if (match.size >= 5):
doc_a_start, doc_a_stop = match.a, match.a + match.size
doc_b_start, doc_b_stop = match.b, match.b + match.size
match_word = doc_a[doc_a_start:doc_a_stop]
if match_word in results:
results[match_word] += 1
else:
results[match_word] = 1
doc_a = doc_a[:doc_a_start] + doc_a[doc_a_stop:]
doc_b = doc_b[:doc_b_start] + doc_b[doc_b_stop:]
else:
break
df = pd.DataFrame(
{
'Value': [x for x in results.keys()],
'Count': [x for x in results.values()]
}
)
print(df)
Create a set of words from each document and build a counter of how many documents each word appears in.
Then iterate over every document again: when you find a word that appears in 70%-90% of the documents,
add it together with the word after it, as a tuple, to a new counter.
Repeat the same step for longer phrases.
from collections import Counter

# Document frequency of single words: each document contributes at most 1 per
# word. `docs` is expected to be an iterable of document strings defined by
# the caller.
one_word = Counter()
for doc in docs:
    # Split the current document (not the whole collection).
    word_set = set(doc.split(" "))
    for word in word_set:
        one_word[word] += 1

# Document frequency of two-word phrases that start with a common word.
two_word = Counter()
threshold = len(docs) * 0.7
for doc in docs:
    word_list = doc.split(" ")
    pairs_in_doc = set()
    for i in range(len(word_list) - 1):
        if one_word[word_list[i]] > threshold:
            pairs_in_doc.add((word_list[i], word_list[i + 1]))
    # Count each pair once per document so the numbers stay comparable with
    # `threshold`, exactly like one_word above.
    for key in pairs_in_doc:
        two_word[key] += 1
You can play with the threshold and continue as long as the counter is not empty.
In the full example below, the docs are the lyrics of the songs "Believer", "By the Rivers of Babylon", "I Could Stay Awake" and "Rattlin' Bog".
from collections import Counter
import os
import glob
TR =1 #threshold
dir = r"D:\docs"
path = os.path.join(dir,"*.txt")
files = glob.glob(path)
one_word = {}
all_docs = {}
for file in files:
one_word[file] = set()
all_docs[file] = []
with open(file) as doc:
for row in doc:
for word in row.split():
one_word[file].add(word)
all_docs[file].append(word)
#now one_word is a dict where the kay is file name and the value is set of words in it
#all_docs is a dict file name is the key and the value is the complete doc stord in a list word by word
common_Frase = Counter()
for key in one_word:
for word in one_word[key]:
common_Frase[word]+=1
#common_Frase containe a count of all words appearence in all files (every file can add a word once)
two_word = {}
for key in all_docs:
two_word[key] = set()
doc = all_docs[key]
for index in range(len(doc)-1):
if common_Frase[doc[index]]>TR:
val = (doc[index], doc[index+1])
two_word[key].add(val)
for key in two_word:
for word in two_word[key]:
common_Frase[word]+=1
#now common_Frase contain a count of all two words frase
three_word = {}
for key in all_docs:
three_word[key] = set()
doc = all_docs[key]
for index in range(len(doc)-2):
val2 = (doc[index], doc[index+1])
if common_Frase[val2]>TR:
val3 = (doc[index], doc[index+1], doc[index+2])
three_word[key].add(val3)
for key in three_word:
for word in three_word[key]:
common_Frase[word]+=1
for k in common_Frase:
if common_Frase[k]>1:
print(k)
This is the output:
when like all Don't And one the my hear and feeling Then your of I'm in me The you away I never to be what a ever thing there from By down Now words that was ('all', 'the') ('And', 'the') ('the', 'words') ('By', 'the') ('and', 'the') ('in', 'the')
I am having this text
/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/
I would like to have Alex, Dog, House and Red in one list, and Maria, Cat, Office and Green in another list.
I am having this code
with open(filename) as f :
for i in f:
if i.startswith("/** Goodmorning"):
#add files to list
elif i.startswith("/** Goodnight"):
#add files to other list
So, is there any way to write the script so that it understands that Alex belongs to the part of the text that starts with Goodmorning?
I'd recommend you to use dict, where "section name" will be a key:
# Parse "/** <section>" ... "*/" blocks into {section name: [entries]}.
with open(filename) as f:
    result = {}
    current_list = None
    for line in f:
        if line.startswith("/**"):
            # A new section starts; its name is whatever follows "/**".
            current_list = []
            result[line[3:].strip()] = current_list
            continue
        # Lines read from a file keep their trailing newline, so compare the
        # stripped text; otherwise "*/" would be stored as an entry.
        entry = line.strip()
        if not entry or entry == "*/":
            continue
        # Ignore stray text that appears before the first section header.
        if current_list is not None:
            current_list.append(entry)
Result:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight': ['Maria', 'Cat', 'Office', 'Green']}
To search which key one of values belongs you can use next code:
search_value = "Alex"
for key, values in result.items():
if search_value in values:
print(search_value, "belongs to", key)
break
I would recommend to use Regular expressions. In python there is a module for this called re
import re

s = """/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/"""
# Capture everything between "/**" and "*/" (word characters, spaces, newlines).
pattern = r'/\*\*([\w \n]+)\*/'
word_groups = re.findall(pattern, s, re.MULTILINE)
d = {}
for word_group in word_groups:
    # Split on any line break and drop blank lines, so the parsing works
    # whether the entries are separated by one newline or by empty lines.
    words = [w.strip() for w in word_group.splitlines() if w.strip()]
    if words:
        # The first line is the section name, the rest are its entries.
        d[words[0]] = words[1:]
print(d)
Output:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight':
['Maria', 'Cat', 'Office', 'Green']}
Expanding on Olvin Roght's answer (sorry, I can't comment - not enough reputation): I would keep a second dictionary for the reverse lookup.
# Build both the forward map (section -> names) and the reverse map
# (name -> section) in a single pass over the file.
with open(filename) as f:
    key_to_list = {}
    name_to_key = {}
    current_list = None
    current_key = None
    for line in f:
        if line.startswith("/**"):
            current_list = []
            current_key = line[3:].strip()
            key_to_list[current_key] = current_list
            continue
        # Compare the stripped text: raw lines still carry their newline, so
        # an unstripped comparison with "*/" would never match.
        current_name = line.strip()
        if not current_name or current_name == "*/" or current_list is None:
            continue
        name_to_key[current_name] = current_key
        current_list.append(current_name)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(key_to_list)
print(name_to_key['Alex'])
alternative is to convert the dictionary afterwards:
name_to_key = {n : k for k in key_to_list for n in key_to_list[k]}
(i.e if you want to go with the regex version from ashwani)
Limitation is that this only permits one membership per name.
I'm facing quite a tricky problem in my python code. I looked around and was not able to find anyone with a similar problem.
I'd like to generate strings translating some characters into several, different ones.
I'd like that original characters, meant to be replaced (translated), to be replaced by several different ones.
What I'm looking to do is something like this :
text = "hi there"
translations = {"i":["b", "c"], "r":["e","f"]}
result = magicfunctionHere(text,translations)
print(result)
> [
"hb there",
"hc there",
"hi theee",
"hi thefe",
"hb theee",
"hb thefe",
"hc theee",
"hc thefe"
]
The result contains any combination of the original text with 'i' and 'r' replaced respectively by 'b' and 'c', and 'e' and 'f'.
I don't see how to do that, using itertools and functions like permutations, product etc...
I hope I'm clear enough, it is quite a specific problem !
Thank you for your help !
def magicfunction(ret, text, alphabet_location, translations):
    """Recursively expand every pending position into all of its options.

    ret: list that collects the finished strings (also returned).
    text: the string built so far.
    alphabet_location: indices in ``text`` still to be expanded; it is popped
        and restored around each recursion level, so it is unchanged on return.
    translations: maps the character at each index to its list of options.
    """
    if len(alphabet_location) == 0:
        ret.append(text)
        return ret
    index = alphabet_location.pop()
    for w in translations[text[index]]:
        ret = magicfunction(ret, text[:index] + w + text[index + 1:], alphabet_location, translations)
    alphabet_location.append(index)
    return ret


def magicfunctionHere(text, translations):
    """Return every variant of ``text`` with at least one character translated.

    Only the first occurrence of each key is replaced. The caller's
    ``translations`` mapping is left untouched, and keys that do not occur in
    ``text`` are ignored.
    """
    alphabet_location = []
    options = {}
    for key, replacements in translations.items():
        index = text.find(key)
        if index == -1:
            # find() returns -1 for a missing key; using it as an index would
            # silently address the last character of the text.
            continue
        alphabet_location.append(index)
        # Work on a copy: "keep the original character" is added as the last
        # option without mutating the caller's lists.
        options[key] = list(replacements) + [key]
    ret = magicfunction([], text, alphabet_location, options)
    # The final combination picks the last option (the original character) at
    # every position, i.e. the unchanged text, so drop it.
    ret.pop()
    return ret


text = "hi there"
translations = {"i":["b", "c"], "r":["e","f"]}
result = magicfunctionHere(text,translations)
print(result)
One crude way to go would be to use a nested-loop construct in 2 steps (functions), as depicted in the snippet below:
def rearrange_characters(str_text, dict_translations):
    """Return the strings obtained by applying exactly one replacement.

    For every key present in ``str_text`` its first occurrence is replaced by
    each of the key's replacers. Results are de-duplicated in discovery order.
    """
    tmp_result = []
    for key, value in dict_translations.items():
        if key in str_text:
            for replacer in value:
                str_temp = str_text.replace(key, replacer, 1)
                if str_temp not in tmp_result:
                    tmp_result.append(str_temp)
    return tmp_result


def get_rearranged_characters(str_text, dict_translations):
    """Return the set of strings reachable with one or two replacements.

    The variants are collected directly in a set. Joining them with "," and
    splitting again would break any text that itself contains a comma and
    would add an empty string whenever a pass yields nothing.
    """
    first_pass = rearrange_characters(str_text, dict_translations)
    variants = set(first_pass)
    for str_part in first_pass:
        variants.update(rearrange_characters(str_part, dict_translations))
    return variants


text = "hi there"
translations = {"i": ["b", "c"], "r":["e","f"]}
result = get_rearranged_characters(text, translations)
print(result)
## YIELDS: {
'hb theee',
'hc thefe',
'hc there',
'hi thefe',
'hb thefe',
'hi theee',
'hc theee',
'hb there'
}
See also: https://eval.in/960803
Another equally convoluted approach would be to use a single function with nested loops like so:
def process_char_replacement(str_text, dict_translations):
    """Return the set of strings reachable with one or two replacements.

    A replacement swaps the first occurrence of a key found in the text for
    one of that key's replacers. Variants are gathered in a set directly
    rather than via a comma-joined string, so texts containing commas stay
    intact and no empty string is added when a pass produces nothing.
    """
    def _single_pass(source):
        # All distinct strings produced by exactly one replacement in source.
        found = []
        for key, value in dict_translations.items():
            if key in source:
                for replacer in value:
                    candidate = source.replace(key, replacer, 1)
                    if candidate not in found:
                        found.append(candidate)
        return found

    first_pass = _single_pass(str_text)
    variants = set(first_pass)
    for str_part in first_pass:
        variants.update(_single_pass(str_part))
    return variants


text = "hi there"
translations = {"i": ["b", "c"], "r":["e","f"]}
result = process_char_replacement(text, translations)
print(result)
## YIELDS: {
'hb theee',
'hc thefe',
'hc there',
'hi thefe',
'hb thefe',
'hi theee',
'hc theee',
'hb there'
}
Refer to: https://eval.in/961602
I want to replace all occurrences of a set of strings in a text line. I came up with this approach, but I am sure there is a better way of doing this:
myDict = {}
test = re.compile(re.escape('pig'), re.IGNORECASE)
myDict['car'] = test
test = re.compile(re.escape('horse'), re.IGNORECASE)
myDict['airplane'] = test
test = re.compile(re.escape('cow'), re.IGNORECASE)
myDict['bus'] = test
mystring = 'I have this Pig and that pig with a hOrse and coW'
for key in myDict:
regex_obj = myDict[key]
mystring = regex_obj.sub(key, mystring)
print mystring
I have this car and that car with a airplane and bus
Based on @Paul Rooney's answer below, ideally I would do this:
def init_regex():
    """Return a dict mapping each replacement word to its compiled pattern.

    Patterns are case-insensitive and match the search word literally.
    """
    rd = {'pig': 'car', 'horse':'airplane', 'cow':'bus'}
    myDict = {}
    # dict.items() exists on both Python 2 and Python 3; iteritems() was
    # removed in Python 3.
    for key, value in rd.items():
        pattern = re.compile(re.escape(key), re.IGNORECASE)
        # NOTE(review): keyed by replacement, so two search words sharing one
        # replacement would overwrite each other - confirm that is acceptable.
        myDict[value] = pattern
    return myDict


def strrep(mystring, patternDict):
    """Apply every pattern in ``patternDict`` to ``mystring`` and return it.

    ``patternDict`` maps replacement text to a compiled pattern, as produced
    by init_regex().
    """
    for key in patternDict:
        regex_obj = patternDict[key]
        mystring = regex_obj.sub(key, mystring)
    return mystring
Try
import itertools
import re

mystring = 'I have this Pig and that pig with a hOrse and coW'
rd = {'pig': 'car', 'horse':'airplane', 'cow':'bus'}
# Search word -> compiled pattern, so each pattern is compiled only once.
cachedict = {}


def strrep(orig, repdict):
    """Replace every key of ``repdict`` found in ``orig`` with its value.

    Matching is case-insensitive and literal: keys are escaped, so regex
    metacharacters in a key (".", "+", ...) match themselves.
    """
    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
    for k, v in repdict.items():
        pattern = cachedict.get(k)
        if pattern is None:
            pattern = re.compile(re.escape(k), re.IGNORECASE)
            cachedict[k] = pattern
        orig = pattern.sub(v, orig)
    return orig


# Parenthesised single-argument print works on both Python 2 and Python 3.
print(strrep(mystring, rd))
This answer was initially written for python2, but for python 3 you would use repdict.items instead of repdict.iteritems.
I'm starting to learn Python and I'm trying to write a program that would import a text file, count the total number of words, count the number of words in a specific paragraph (said by each participant, described by 'P1', 'P2' etc.), exclude these words (i.e. 'P1' etc.) from my word count, and print paragraphs separately.
Thanks to @James Hurford I got this code:
words = None
with open('data.txt') as f:
words = f.read().split()
total_words = len(words)
print 'Total words:', total_words
in_para = False
para_type = None
paragraph = list()
for word in words:
if ('P1' in word or
'P2' in word or
'P3' in word ):
if in_para == False:
in_para = True
para_type = word
else:
print 'Words in paragraph', para_type, ':', len(paragraph)
print ' '.join(paragraph)
del paragraph[:]
para_type = word
else:
paragraph.append(word)
else:
if in_para == True:
print 'Words in last paragraph', para_type, ':', len(paragraph)
print ' '.join(paragraph)
else:
print 'No words'
My text file looks like this:
P1: Bla bla bla.
P2: Bla bla bla bla.
P1: Bla bla.
P3: Bla.
The next part I need to do is summing up the words for each participant. I can only print them, but I don't know how to return/reuse them.
I would need a new variable with word count for each participant that I could manipulate later on, in addition to summing up all the words said by each participant, e.g.
P1all = sum of words in paragraph
Is there a way to count "you're" or "it's" etc. as two words?
Any ideas how to solve it?
I would need a new variable with word count for each participant that I could manipulate later on
No, you would need a Counter (Python 2.7+, else use a defaultdict(int)) mapping persons to word counts.
from collections import Counter
#from collections import defaultdict
# Speaker label -> total number of whitespace-separated words spoken.
words_per_person = Counter()
#words_per_person = defaultdict(int)
for ln in inputfile:
    # partition() never raises: lines without a colon (blank lines, stray
    # text) give an empty separator and are skipped instead of crashing the
    # two-name unpacking.
    person, sep, text = ln.partition(':')
    if not sep:
        continue
    words_per_person[person.strip()] += len(text.split())
Now words_per_person['P1'] contains the number of words of P1, assuming text.split() is a good enough tokenizer for your purposes. (Linguists disagree about the definition of word, so you're always going to get an approximation.)
Congrats on beginning your adventure with Python! Not everything in this post might make sense right now, but bookmark it and come back to it if it seems helpful later. Eventually you should try to move from scripting to software engineering, and here are a few ideas for you!
With great power comes great responsibility, and as a Python developer you need to be more disciplined than other languages which don't hold your hand and enforce "good" design.
I find it helps to start with a top-down design.
def main():
text = get_text()
p_text = process_text(text)
catalogue = process_catalogue(p_text)
BOOM! You just wrote the whole program -- now you just need to back and fill in the blanks! When you do it like this, it seems less intimidating. Personally, I don't consider myself smart enough to solve very big problems, but I'm a pro at solving small problems. So lets tackle one thing at a time. I'm going to start with 'process_text'.
def process_text(text):
    """Run the text through the bundle -> filter -> clean stages in order."""
    b_text = bundle_dialogue_items(text)
    f_text = filter_dialogue_items(b_text)
    c_text = clean_dialogue_items(f_text)
    # Hand the cleaned items back; without this the caller receives None.
    return c_text
I'm not really sure what those things mean yet, but I know that text problems tend to follow a pattern called "map/reduce", which means you perform an operation on something and then you clean it up and combine the results, so I put in some placeholder functions. I might go back and add more if necessary.
Now let's write 'process_catalogue'. I could've written "process_dict" but that sounded lame to me.
def process_catalogue(p_text):
    """Build the speaker catalogue from processed text and add word totals."""
    # Use the parameter that was actually passed in; the name c_text is not
    # defined inside this function.
    speakers = make_catalogue(p_text)
    s_speakers = sum_words_per_paragraph_items(speakers)
    t_speakers = total_word_count(s_speakers)
    return t_speakers
Cool. Not too bad. You might approach this differently than me, but I thought it would make sense to aggregate the items, then count the words per paragraph, and then count all the words.
So, at this point I'd probably make one or two little 'lib' (library) modules to back-fill the remaining functions. For the sake of you being able to run this without worrying about imports, I'm going to stick it all in one .py file, but eventually you'll learn how to break these up so it looks nicer. So let's do this.
# ------------------ #
# == process_text == #
# ------------------ #
def bundle_dialogue_items(lines):
    """Yield (speaker, dialogue, paragraph number) for every input line.

    A line matching the speaker pattern ``p`` starts a new paragraph for that
    speaker; any other line is attributed to the most recent speaker.
    """
    cur_speaker = None
    paragraphs = Counter()
    for line in lines:
        if re.match(p, line):
            # Split on the first colon only, so a colon inside the dialogue
            # does not break the two-name unpacking.
            cur_speaker, dialogue = line.split(':', 1)
            paragraphs[cur_speaker] += 1
        else:
            dialogue = line
        res = cur_speaker, dialogue, paragraphs[cur_speaker]
        yield res
def filter_dialogue_items(lines):
for name, dialogue, paragraph in lines:
if dialogue:
res = name, dialogue, paragraph
yield res
def clean_dialogue_items(flines):
for name, dialogue, paragraph in flines:
s_dialogue = dialogue.strip().split()
c_dialouge = [clean_word(w) for w in s_dialogue]
res = name, c_dialouge, paragraph
yield res
aaaand a little helper function
# ------------------- #
# == aux functions == #
# ------------------- #
to_clean = string.whitespace + string.punctuation
def clean_word(word):
res = ''.join(c for c in word if c not in to_clean)
return res
So it may not be obvious, but this library is designed as a data processing pipeline. There are several ways to process data: one is pipeline processing and another is batch processing. Let's take a look at batch processing.
# ----------------------- #
# == process_catalogue == #
# ----------------------- #
speaker_stats = 'stats'
def make_catalogue(names_with_dialogue):
speakers = {}
for name, dialogue, paragraph in names_with_dialogue:
speaker = speakers.setdefault(name, {})
stats = speaker.setdefault(speaker_stats, {})
stats.setdefault(paragraph, []).extend(dialogue)
return speakers
word_count = 'word_count'
def sum_words_per_paragraph_items(speakers):
for speaker in speakers:
word_stats = speakers[speaker][speaker_stats]
speakers[speaker][word_count] = Counter()
for paragraph in word_stats:
speakers[speaker][word_count][paragraph] += len(word_stats[paragraph])
return speakers
total = 'total'
def total_word_count(speakers):
for speaker in speakers:
wc = speakers[speaker][word_count]
speakers[speaker][total] = 0
for c in wc:
speakers[speaker][total] += wc[c]
return speakers
All these nested dictionaries are getting a little complicated. In actual production code I would replace these with some more readable classes (along with adding tests and docstrings!!), but I don't want to make this more confusing than it already is! Alright, for your convenience below is the whole thing put together.
import pprint
import re
import string
from collections import Counter
# A speaker label: one or more word characters followed by a colon at the
# start of the line.
p = re.compile(r'(\w+?):')


def get_text_line_items(text):
    """Yield the lines of ``text`` one at a time, split on newline."""
    for line in text.split('\n'):
        yield line


def bundle_dialogue_items(lines):
    """Yield (speaker, dialogue, paragraph number) for every input line.

    A line starting with a speaker label opens a new paragraph for that
    speaker; any other line is attributed to the most recent speaker (None
    until the first label has been seen).
    """
    cur_speaker = None
    paragraphs = Counter()
    for line in lines:
        if p.match(line):
            # Split on the first colon only, so a colon inside the dialogue
            # does not break the two-name unpacking.
            cur_speaker, dialogue = line.split(':', 1)
            paragraphs[cur_speaker] += 1
        else:
            dialogue = line
        res = cur_speaker, dialogue, paragraphs[cur_speaker]
        yield res
def filter_dialogue_items(lines):
for name, dialogue, paragraph in lines:
if dialogue:
res = name, dialogue, paragraph
yield res
to_clean = string.whitespace + string.punctuation
def clean_word(word):
res = ''.join(c for c in word if c not in to_clean)
return res
def clean_dialogue_items(flines):
for name, dialogue, paragraph in flines:
s_dialogue = dialogue.strip().split()
c_dialouge = [clean_word(w) for w in s_dialogue]
res = name, c_dialouge, paragraph
yield res
speaker_stats = 'stats'
def make_catalogue(names_with_dialogue):
speakers = {}
for name, dialogue, paragraph in names_with_dialogue:
speaker = speakers.setdefault(name, {})
stats = speaker.setdefault(speaker_stats, {})
stats.setdefault(paragraph, []).extend(dialogue)
return speakers
def clean_dict(speakers):
for speaker in speakers:
stats = speakers[speaker][speaker_stats]
for paragraph in stats:
stats[paragraph] = [''.join(c for c in word if c not in to_clean)
for word in stats[paragraph]]
return speakers
word_count = 'word_count'
def sum_words_per_paragraph_items(speakers):
for speaker in speakers:
word_stats = speakers[speaker][speaker_stats]
speakers[speaker][word_count] = Counter()
for paragraph in word_stats:
speakers[speaker][word_count][paragraph] += len(word_stats[paragraph])
return speakers
total = 'total'
def total_word_count(speakers):
for speaker in speakers:
wc = speakers[speaker][word_count]
speakers[speaker][total] = 0
for c in wc:
speakers[speaker][total] += wc[c]
return speakers
def get_text():
text = '''BOB: blah blah blah blah
blah hello goodbye etc.
JERRY:.............................................
...............
BOB:blah blah blah
blah blah blah
blah.
BOB: boopy doopy doop
P1: Bla bla bla.
P2: Bla bla bla bla.
P1: Bla bla.
P3: Bla.'''
text = get_text_line_items(text)
return text
def process_catalogue(c_text):
speakers = make_catalogue(c_text)
s_speakers = sum_words_per_paragraph_items(speakers)
t_speakers = total_word_count(s_speakers)
return t_speakers
def process_text(text):
b_text = bundle_dialogue_items(text)
f_text = filter_dialogue_items(b_text)
c_text = clean_dialogue_items(f_text)
return c_text
def main():
text = get_text()
c_text = process_text(text)
t_speakers = process_catalogue(c_text)
# take a look at your hard work!
pprint.pprint(t_speakers)
if __name__ == '__main__':
main()
So this script is almost certainly overkill for this application, but the point is to see what (questionably) readable, maintainable, modular Python code might look like.
Pretty sure output looks something like:
{'BOB': {'stats': {1: ['blah',
'blah',
'blah',
'blah',
'blah',
'hello',
'goodbye',
'etc'],
2: ['blah',
'blah',
'blah',
'blah',
'blah',
'blah',
'blah'],
3: ['boopy', 'doopy', 'doop']},
'total': 18,
'word_count': Counter({1: 8, 2: 7, 3: 3})},
'JERRY': {'stats': {1: ['', '']}, 'total': 2, 'word_count': Counter({1: 2})},
'P1': {'stats': {1: ['Bla', 'bla', 'bla'], 2: ['Bla', 'bla']},
'total': 5,
'word_count': Counter({1: 3, 2: 2})},
'P2': {'stats': {1: ['Bla', 'bla', 'bla', 'bla']},
'total': 4,
'word_count': Counter({1: 4})},
'P3': {'stats': {1: ['Bla']}, 'total': 1, 'word_count': Counter({1: 1})}}
You can do this with two variables. One to keep track of what person is speaking, the other to keep the paragraphs for the persons speaking. For storing the paragraphs and associating who it is that the paragraph belongs to use a dict with the person as the key and a list of paragraphs that person said associated with this key.
para_dict = dict()
para_type = None
for word in words:
if ('P1' in word or
'P2' in word or
'P3' in word ):
#extract the part we want leaving off the ':'
para_type = word[:2]
#create a dict with a list of lists
#to contain each paragraph the person uses
if para_type not in para_dict:
para_dict[para_type] = list()
para_dict[para_type].append(list())
else:
#Append the word to the last list in the list of lists
para_dict[para_type][-1].append(word)
From here you can sum up the number of words spoken thus
for person, para_list in para_dict.items():
counts_list = list()
for para in para_list:
counts_list.append(len(para))
print person, 'spoke', sum(counts_list), 'words'