I'm facing quite a tricky problem in my python code. I looked around and was not able to find anyone with a similar problem.
I'd like to generate strings translating some characters into several, different ones.
I'd like that original characters, meant to be replaced (translated), to be replaced by several different ones.
What I'm looking to do is something like this :
text = "hi there"
translations = {"i":["b", "c"], "r":["e","f"]}
result = magicfunctionHere(text,translations)
print(result)
> [
"hb there",
"hc there",
"hi theee",
"hi thefe",
"hb theee",
"hb thefe",
"hc theee",
"hc thefe"
]
The result contains any combination of the original text with 'i' and 'r' replaced respectively by 'b' and 'c', and 'e' and 'f'.
I don't see how to do that, using itertools and functions like permutations, product etc...
I hope I'm clear enough, it is quite a specific problem !
Thank you for your help !
def magicfunction(ret, text, alphabet_location, translations):
if len(alphabet_location) == 0:
ret.append(text)
return ret
index = alphabet_location.pop()
for w in translations[text[index]]:
ret = magicfunction(ret, text[:index] + w + text[index + 1:], alphabet_location, translations)
alphabet_location.append(index)
return ret
def magicfunctionHere(text, translations):
alphabet_location = []
for key in translations.keys():
alphabet_location.append(text.find(key))
translations[key].append(key)
ret = []
ret = magicfunction(ret, text, alphabet_location, translations)
ret.pop()
return ret
text = "hi there"
translations = {"i":["b", "c"], "r":["e","f"]}
result = magicfunctionHere(text,translations)
print(result)
One crude way to go would be to use a Nested Loop Constructin 2 steps (Functions) as depicted in the Snippet below:
def rearrange_characters(str_text, dict_translations):
tmp_result = []
for key, value in dict_translations.items():
if key in str_text:
for replacer in value:
str_temp = str_text.replace(key, replacer, 1)
if str_temp not in tmp_result:
tmp_result.append(str_temp)
return tmp_result
def get_rearranged_characters(str_text, dict_translations):
lst_result = rearrange_characters(str_text, dict_translations)
str_joined = ','.join(lst_result)
for str_part in lst_result:
str_joined = "{},{}".format(str_joined, ','.join(rearrange_characters(str_part, dict_translations)))
return set(str_joined.split(sep=","))
text = "hi there"
translations = {"i": ["b", "c"], "r":["e","f"]}
result = get_rearranged_characters(text, translations)
print(result)
## YIELDS: {
'hb theee',
'hc thefe',
'hc there',
'hi thefe',
'hb thefe',
'hi theee',
'hc theee',
'hb there'
}
See also: https://eval.in/960803
Another equally convoluted approach would be to use a single function with nested loops like so:
def process_char_replacement(str_text, dict_translations):
tmp_result = []
for key, value in dict_translations.items():
if key in str_text:
for replacer in value:
str_temp = str_text.replace(key, replacer, 1)
if str_temp not in tmp_result:
tmp_result.append(str_temp)
str_joined = ','.join(tmp_result)
for str_part in tmp_result:
tmp_result_2 = []
for key, value in dict_translations.items():
if key in str_part:
for replacer in value:
str_temp = str_part.replace(key, replacer, 1)
if str_temp not in tmp_result_2:
tmp_result_2.append(str_temp)
str_joined = "{},{}".format(str_joined, ','.join(tmp_result_2))
return set(str_joined.split(sep=","))
text = "hi there"
translations = {"i": ["b", "c"], "r":["e","f"]}
result = process_char_replacement(text, translations)
print(result)
## YIELDS: {
'hb theee',
'hc thefe',
'hc there',
'hi thefe',
'hb thefe',
'hi theee',
'hc theee',
'hb there'
}
Refer to: https://eval.in/961602
Related
using cosine similarity I am trying to find the semantic word comparison. I have posted the code below for reference, in the code, I have added the stopwords, which are the words I don't want to be found during the search. I have opened the text file with which I want the reference words (also given below ) to be compared. I am also adding a limit of 3 words to the search means any word less than three characters is to be considered a stop word. while running the code it is giving me the process finished with exit code 0 and I can't get an output from the code. Would really appreciate some help. Thank you in advance.
import math
import re
stopwords = set (["is", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost",
"alonll", "with", "within", "without", "would", "yet", "you", "your",
"yours", "yourself", "yourselves", "the"])
with open("ref.txt", "r") as f:
lines = f.readlines()
def build_frequency_vector(content: str) -> dict[str, int]:
vector = {}
word_seq = re.split("[ ,;.!?]+", content)
for words in word_seq:
if words not in stopwords and len(words) >= 3:
words = words.lower()
if words in vector:
vector[words] = vector[words] + 1
else:
vector[words] = 1
return vector
refWords = ['spain', 'anchovy',
'france', 'internet', 'china', 'mexico', 'fish', 'industry', 'agriculture', 'fishery', 'tuna', 'transport',
'italy', 'web', 'communication', 'labour', 'fish', 'cod']
refWordsDict = {}
for refWord in refWords:
refWordsDict[refWord] = {}
for line in lines:
line = line.lower()
temp = build_frequency_vector(line)
if refWord not in temp:
continue
for word in temp:
if word not in stopwords and len(word) >= 3 and word != refWord:
refWordsDict[refWord][word] = refWordsDict[refWord].get(word, 0) + temp[word]
def product(v1: dict[str, int], v2: dict[str, int]) -> float:
sp = 0.0
for word in v1:
sp += v1[word] * v2.get(word, 0)
return sp
def cosineSimilarity(s1: str, s2: str) -> float :
d1 = build_frequency_vector(word1)
d2 = build_frequency_vector(word2)
return product(d1, d2) / (math.sqrt(product(d1, d1) * product(d2, d2)))
bests = {}
for word1 in refWords:
bestSimilarity = 0
for word2 in refWords:
if word1 != word2:
similarity: float = cosineSimilarity(refWordsDict[word1], refWordsDict[word2])
if similarity > bestSimilarity:
bestSimilarity = similarity
bests[word1] = (word2, bestSimilarity)
for item in bests:
print(item, "->", bests[item])
I am very new to python and not able to find a solution.
data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
pref_network_find = re.findall('(\S+\s+255.255.255.\w+)',str(data))
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
i = i.replace(word,str(mydict[word]))
pref = print (i)
listi = []
for line in pref_network_find:
listi.append(i)
print (listi)
10.185.16.64 27
55.242.33.0 24
55.242.154.0 30
['55.242.154.0 30', '55.242.154.0 30', '55.242.154.0 30']
Process finished with exit code 0
Im trying to get ['55.242.154.0 30', '55.242.33.0 24', '10.185.16.64 27'] as list1 at the end, but cant understand my mistake here. Could you help me with that?
You do not need to garner the initial spliced and joined IPs with regex; instead, just use str.split():
import re
data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
final_list = sorted(['{} {}'.format(b, mydict[c]) for a, b, c in [i.split() for i in data]], key=lambda x:map(int, re.split('\.|\s', x)), reverse=True)
Output:
['55.242.154.0 30', '55.242.33.0 24', '10.185.16.64 27']
Obviously, it will print 30 at the end because your this code
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
i = i.replace(word,str(mydict[word]))
pref = print (i)
i is 30 after execution. And you are using old variable 'i' like this
for line in pref_network_find:
listi.append(i)
So yes the code is doing its job well, i is 30 and it is appending 30 to your result.
Correct code goes like this.
import re
data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
pref_network_find = re.findall('(\S+\s+255.255.255.\w+)',str(data))
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
listi = []
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
i = i.replace(word,str(mydict[word]))
pref = print (i)
listi.append(i)
print (listi)
Correct me if I am wrong here, maybe you want something else, however, this is what I understood by your question.
Your code is wrong because you are appending with wrong index i at here :
for line in pref_network_find:
listi.append(i)
We have last value in i = 55.242.154.0 from previous loop. You should use line instead of i or append in for loop directly
data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
pref_network_find = re.findall('(\S+\s+255.255.255.\w+)',str(data))
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
listi = []
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
listi.append(i.replace(word, str(mydict[word])))
print(listi)
I know there are many questions with the same title. My situation is a little different. I have a string like:
"Cat(Money(8)Points(80)Friends(Online(0)Offline(8)Total(8)))Mouse(Money(10)Points(10000)Friends(Online(10)Offline(80)Total(90)))"
(Notice that there are parenthesis nested inside another)
and I need to parse it into nested dictionaries like for example:
d["Cat"]["Money"] == 8
d["Cat"]["Points"] = 80
d["Mouse"]["Friends"]["Online"] == 10
and so on. I would like to do this without libraries and regex. If you choose to use these, please explain the code in great detail.
Thanks in advance!
Edit:
Although this code will not make any sense, this is what I have so far:
o_str = "Jake(Money(8)Points(80)Friends(Online(0)Offline(8)Total(8)))Mouse(Money(10)Points(10000)Friends(Online(10)Offline(80)Total(90)))"
spl = o_str.split("(")
def reverseIndex(str1, str2):
try:
return len(str1) - str1.rindex(str2)
except Exception:
return len(str1)
def app(arr,end):
new_arr = []
for i in range(0,len(arr)):
if i < len(arr)-1:
new_arr.append(arr[i]+end)
else:
new_arr.append(arr[i])
return new_arr
spl = app(spl,"(")
ends = []
end_words = []
op = 0
cl = 0
for i in range(0,len(spl)):
print i
cl += spl[i].count(")")
op += 1
if cl == op-1:
ends.append(i)
end_words.append(spl[i])
#break
print op
print cl
print
print end_words
The end words are the sections at the beginning of each statement. I plan on using recursive to do the rest.
Now that was interesting. You really nerd-sniped me on this one...
def parse(tokens):
""" take iterator of tokens, parse to dictionary or atom """
dictionary = {}
# iterate tokens...
for token in tokens:
if token == ")" or next(tokens) == ")":
# token is ')' -> end of dict; next is ')' -> 'leaf'
break
# add sub-parse to dictionary
dictionary[token] = parse(tokens)
# return dict, if non-empty, else token
return dictionary or int(token)
Setup and demo:
>>> s = "Cat(Money(8)Points(80)Friends(Online(0)Offline(8)Total(8)))Mouse(Money(10)Points(10000)Friends(Online(10)Offline(80)Total(90)))"
>>> tokens = iter(s.replace("(", " ( ").replace(")", " ) ").split())
>>> pprint(parse(tokens))
{'Cat': {'Friends': {'Offline': 8, 'Online': 0, 'Total': 8},
'Money': 8,
'Points': 80},
'Mouse': {'Friends': {'Offline': 80, 'Online': 10, 'Total': 90},
'Money': 10,
'Points': 10000}}
Alternatively, you could also use a series of string replacements to turn that string into an actual Python dictionary string and then evaluate that, e.g. like this:
as_dict = eval("{'" + s.replace(")", "'}, ")
.replace("(", "': {'")
.replace(", ", ", '")
.replace(", ''", "")[:-3] + "}")
This will wrap the 'leafs' in singleton sets of strings, e.g. {'8'} instead of 8, but this should be easy to fix in a post-processing step.
I am quiet new to regular expressions. I have a string that looks like this:
str = "abc/def/([default], [testing])"
and a dictionary
dict = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
and using Python RE, I want str in this form, after comparisons of each element in dict to str:
str = "abc/def/(2.7, 2.1)"
Any help how to do it using Python RE?
P.S. its not the part of any assignment, instead it is the part of my project at work and I have spent many hours to figure out solution but in vain.
import re
st = "abc/def/([default], [testing], [something])"
dic = {'abc/def/[default]' : '2.7',
'abc/def/[testing]' : '2.1',
'bcd/xed/[something]' : '3.1'}
prefix_regex = "^[\w*/]*"
tag_regex = "\[\w*\]"
prefix = re.findall(prefix_regex, st)[0]
tags = re.findall(tag_regex, st)
for key in dic:
key_prefix = re.findall(prefix_regex, key)[0]
key_tag = re.findall(tag_regex, key)[0]
if prefix == key_prefix:
for tag in tags:
if tag == key_tag:
st = st.replace(tag, dic[key])
print st
OUTPUT:
abc/def/(2.7, 2.1, [something])
Here is a solution using re module.
Hypotheses :
there is a dictionary whose keys are composed of a prefix and a variable part, the variable part is enclosed in brackets ([])
the values are strings by which the variable parts are to be replaced in the string
the string is composed by a prefix, a (, a list of variable parts and a )
the variable parts in the string are enclosed in []
the variable parts in the string are separated by a comma followed by optional spaces
Python code :
import re
class splitter:
pref = re.compile("[^(]+")
iden = re.compile("\[[^]]*\]")
def __init__(self, d):
self.d = d
def split(self, s):
m = self.pref.match(s)
if m is not None:
p = m.group(0)
elts = self.iden.findall(s, m.span()[1])
return p, elts
return None
def convert(self, s):
p, elts = self.split(s)
return p + "(" + ", ".join((self.d[p + elt] for elt in elts)) + ")"
Usage :
s = "abc/def/([default], [testing])"
d = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
sp = splitter(d)
print(sp.convert(s))
output :
abc/def/(2.7, 2.1)
Regex is probably not required here. Hope this helps
lhs,rhs = str.split("/(")
rhs1,rhs2 = rhs.strip(")").split(", ")
lhs+="/"
print "{0}({1},{2})".format(lhs,dict[lhs+rhs1],dict[lhs+rhs2])
output
abc/def/(2.7,2.1)
I'm starting to learn Python and I'm trying to write a program that would import a text file, count the total number of words, count the number of words in a specific paragraph (said by each participant, described by 'P1', 'P2' etc.), exclude these words (i.e. 'P1' etc.) from my word count, and print paragraphs separately.
Thanks to #James Hurford I got this code:
words = None
with open('data.txt') as f:
words = f.read().split()
total_words = len(words)
print 'Total words:', total_words
in_para = False
para_type = None
paragraph = list()
for word in words:
if ('P1' in word or
'P2' in word or
'P3' in word ):
if in_para == False:
in_para = True
para_type = word
else:
print 'Words in paragraph', para_type, ':', len(paragraph)
print ' '.join(paragraph)
del paragraph[:]
para_type = word
else:
paragraph.append(word)
else:
if in_para == True:
print 'Words in last paragraph', para_type, ':', len(paragraph)
print ' '.join(paragraph)
else:
print 'No words'
My text file looks like this:
P1: Bla bla bla.
P2: Bla bla bla bla.
P1: Bla bla.
P3: Bla.
The next part I need to do is summing up the words for each participant. I can only print them, but I don't know how to return/reuse them.
I would need a new variable with word count for each participant that I could manipulate later on, in addition to summing up all the words said by each participant, e.g.
P1all = sum of words in paragraph
Is there a way to count "you're" or "it's" etc. as two words?
Any ideas how to solve it?
I would need a new variable with word count for each participant that I could manipulate later on
No, you would need a Counter (Python 2.7+, else use a defaultdict(int)) mapping persons to word counts.
from collections import Counter
#from collections import defaultdict
words_per_person = Counter()
#words_per_person = defaultdict(int)
for ln in inputfile:
person, text = ln.split(':', 1)
words_per_person[person] += len(text.split())
Now words_per_person['P1'] contains the number of words of P1, assuming text.split() is a good enough tokenizer for your purposes. (Linguists disagree about the definition of word, so you're always going to get an approximation.)
Congrats on beginning your adventure with Python! Not everything in this post might make sense right now but bookmark it and comeback to it if it seems helpful later. Eventually you should try to move from scripting to software engineering, and here are a few ideas for you!
With great power comes great responsibility, and as a Python developer you need to be more disciplined than other languages which don't hold your hand and enforce "good" design.
I find it helps to start with a top-down design.
def main():
text = get_text()
p_text = process_text(text)
catalogue = process_catalogue(p_text)
BOOM! You just wrote the whole program -- now you just need to back and fill in the blanks! When you do it like this, it seems less intimidating. Personally, I don't consider myself smart enough to solve very big problems, but I'm a pro at solving small problems. So lets tackle one thing at a time. I'm going to start with 'process_text'.
def process_text(text):
b_text = bundle_dialogue_items(text)
f_text = filter_dialogue_items(b_text)
c_text = clean_dialogue_items(f_text)
I'm not really sure what those things mean yet, but I know that text problems tend to follow a pattern called "map/reduce" which means you perform and operation on something and then you clean it up and combine, so I put in some placeholder functions. I might go back and add more if necessary.
Now let's write 'process_catalogue'. I could've written "process_dict" but that sounded lame to me.
def process_catalogue(p_text):
speakers = make_catalogue(c_text)
s_speakers = sum_words_per_paragraph_items(speakers)
t_speakers = total_word_count(s_speakers)
Cool. Not too bad. You might approach this different than me, but I thought it would make sense to aggregate the items, the count the words per paragraph, and then count all the words.
So, at this point I'd probably make one or two little 'lib' (library) modules to back-fill the remaining functions. For the sake you being able to run this without worrying about imports, I'm going to stick it all in one .py file, but eventually you'll learn how to break these up so it looks nicer. So let's do this.
# ------------------ #
# == process_text == #
# ------------------ #
def bundle_dialogue_items(lines):
cur_speaker = None
paragraphs = Counter()
for line in lines:
if re.match(p, line):
cur_speaker, dialogue = line.split(':')
paragraphs[cur_speaker] += 1
else:
dialogue = line
res = cur_speaker, dialogue, paragraphs[cur_speaker]
yield res
def filter_dialogue_items(lines):
for name, dialogue, paragraph in lines:
if dialogue:
res = name, dialogue, paragraph
yield res
def clean_dialogue_items(flines):
for name, dialogue, paragraph in flines:
s_dialogue = dialogue.strip().split()
c_dialouge = [clean_word(w) for w in s_dialogue]
res = name, c_dialouge, paragraph
yield res
aaaand a little helper function
# ------------------- #
# == aux functions == #
# ------------------- #
to_clean = string.whitespace + string.punctuation
def clean_word(word):
res = ''.join(c for c in word if c not in to_clean)
return res
So it may not be obvious but this library is designed as a data processing pipeline. There several ways to process data, one is pipeline processing and another is batch processing. Let's take a look at batch processing.
# ----------------------- #
# == process_catalogue == #
# ----------------------- #
speaker_stats = 'stats'
def make_catalogue(names_with_dialogue):
speakers = {}
for name, dialogue, paragraph in names_with_dialogue:
speaker = speakers.setdefault(name, {})
stats = speaker.setdefault(speaker_stats, {})
stats.setdefault(paragraph, []).extend(dialogue)
return speakers
word_count = 'word_count'
def sum_words_per_paragraph_items(speakers):
for speaker in speakers:
word_stats = speakers[speaker][speaker_stats]
speakers[speaker][word_count] = Counter()
for paragraph in word_stats:
speakers[speaker][word_count][paragraph] += len(word_stats[paragraph])
return speakers
total = 'total'
def total_word_count(speakers):
for speaker in speakers:
wc = speakers[speaker][word_count]
speakers[speaker][total] = 0
for c in wc:
speakers[speaker][total] += wc[c]
return speakers
All these nested dictionaries are getting a little complicated. In actual production code I would replace these with some more readable classes (along with adding tests and docstrings!!), but I don't want to make this more confusing than it already is! Alright, for your convenience below is the whole thing put together.
import pprint
import re
import string
from collections import Counter
p = re.compile(r'(\w+?):')
def get_text_line_items(text):
for line in text.split('\n'):
yield line
def bundle_dialogue_items(lines):
cur_speaker = None
paragraphs = Counter()
for line in lines:
if re.match(p, line):
cur_speaker, dialogue = line.split(':')
paragraphs[cur_speaker] += 1
else:
dialogue = line
res = cur_speaker, dialogue, paragraphs[cur_speaker]
yield res
def filter_dialogue_items(lines):
for name, dialogue, paragraph in lines:
if dialogue:
res = name, dialogue, paragraph
yield res
to_clean = string.whitespace + string.punctuation
def clean_word(word):
res = ''.join(c for c in word if c not in to_clean)
return res
def clean_dialogue_items(flines):
for name, dialogue, paragraph in flines:
s_dialogue = dialogue.strip().split()
c_dialouge = [clean_word(w) for w in s_dialogue]
res = name, c_dialouge, paragraph
yield res
speaker_stats = 'stats'
def make_catalogue(names_with_dialogue):
speakers = {}
for name, dialogue, paragraph in names_with_dialogue:
speaker = speakers.setdefault(name, {})
stats = speaker.setdefault(speaker_stats, {})
stats.setdefault(paragraph, []).extend(dialogue)
return speakers
def clean_dict(speakers):
for speaker in speakers:
stats = speakers[speaker][speaker_stats]
for paragraph in stats:
stats[paragraph] = [''.join(c for c in word if c not in to_clean)
for word in stats[paragraph]]
return speakers
word_count = 'word_count'
def sum_words_per_paragraph_items(speakers):
for speaker in speakers:
word_stats = speakers[speaker][speaker_stats]
speakers[speaker][word_count] = Counter()
for paragraph in word_stats:
speakers[speaker][word_count][paragraph] += len(word_stats[paragraph])
return speakers
total = 'total'
def total_word_count(speakers):
for speaker in speakers:
wc = speakers[speaker][word_count]
speakers[speaker][total] = 0
for c in wc:
speakers[speaker][total] += wc[c]
return speakers
def get_text():
text = '''BOB: blah blah blah blah
blah hello goodbye etc.
JERRY:.............................................
...............
BOB:blah blah blah
blah blah blah
blah.
BOB: boopy doopy doop
P1: Bla bla bla.
P2: Bla bla bla bla.
P1: Bla bla.
P3: Bla.'''
text = get_text_line_items(text)
return text
def process_catalogue(c_text):
speakers = make_catalogue(c_text)
s_speakers = sum_words_per_paragraph_items(speakers)
t_speakers = total_word_count(s_speakers)
return t_speakers
def process_text(text):
b_text = bundle_dialogue_items(text)
f_text = filter_dialogue_items(b_text)
c_text = clean_dialogue_items(f_text)
return c_text
def main():
text = get_text()
c_text = process_text(text)
t_speakers = process_catalogue(c_text)
# take a look at your hard work!
pprint.pprint(t_speakers)
if __name__ == '__main__':
main()
So this script is almost certainly overkill for this application, but the point is to see what (questionably) readable, maintainable, modular Python code might look like.
Pretty sure output looks something like:
{'BOB': {'stats': {1: ['blah',
'blah',
'blah',
'blah',
'blah',
'hello',
'goodbye',
'etc'],
2: ['blah',
'blah',
'blah',
'blah',
'blah',
'blah',
'blah'],
3: ['boopy', 'doopy', 'doop']},
'total': 18,
'word_count': Counter({1: 8, 2: 7, 3: 3})},
'JERRY': {'stats': {1: ['', '']}, 'total': 2, 'word_count': Counter({1: 2})},
'P1': {'stats': {1: ['Bla', 'bla', 'bla'], 2: ['Bla', 'bla']},
'total': 5,
'word_count': Counter({1: 3, 2: 2})},
'P2': {'stats': {1: ['Bla', 'bla', 'bla', 'bla']},
'total': 4,
'word_count': Counter({1: 4})},
'P3': {'stats': {1: ['Bla']}, 'total': 1, 'word_count': Counter({1: 1})}}
You can do this with two variables. One to keep track of what person is speaking, the other to keep the paragraphs for the persons speaking. For storing the paragraphs and associating who it is that the paragraph belongs to use a dict with the person as the key and a list of paragraphs that person said associated with this key.
para_dict = dict()
para_type = None
for word in words:
if ('P1' in word or
'P2' in word or
'P3' in word ):
#extract the part we want leaving off the ':'
para_type = word[:2]
#create a dict with a list of lists
#to contain each paragraph the person uses
if para_type not in para_dict:
para_dict[para_type] = list()
para_dict[para_type].append(list())
else:
#Append the word to the last list in the list of lists
para_dict[para_type][-1].append(word)
From here you can sum up the number of words spoken thus
for person, para_list in para_dict.items():
counts_list = list()
for para in para_list:
counts_list.append(len(para))
print person, 'spoke', sum(counts_list), 'words'