Python Case Insensitive Replace All of multiple strings - python

I want to replace all occurrences of a set of strings in a text line. I came up with this approach, but I am sure there is a better way of doing this:
# Map each replacement word to a compiled, case-insensitive pattern matching
# the literal word it should replace.
myDict = {}
test = re.compile(re.escape('pig'), re.IGNORECASE)
myDict['car'] = test
test = re.compile(re.escape('horse'), re.IGNORECASE)
myDict['airplane'] = test
test = re.compile(re.escape('cow'), re.IGNORECASE)
myDict['bus'] = test
mystring = 'I have this Pig and that pig with a hOrse and coW'
# Apply every pattern in turn; each pass rewrites the result of the last one.
for key, regex_obj in myDict.items():
    mystring = regex_obj.sub(key, mystring)
print(mystring)
I have this car and that car with a airplane and bus
Based on @Paul Rooney's answer below, ideally I would do this:
def init_regex():
    """Build the replacement table consumed by ``strrep``.

    Returns:
        dict mapping each replacement word to a compiled, case-insensitive
        pattern that matches the literal (escaped) word it replaces.
    """
    rd = {'pig': 'car', 'horse': 'airplane', 'cow': 'bus'}
    myDict = {}
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for key, value in rd.items():
        myDict[value] = re.compile(re.escape(key), re.IGNORECASE)
    return myDict
def strrep(mystring, patternDict):
    """Apply every replacement in ``patternDict`` to ``mystring``.

    Args:
        mystring: the text to rewrite.
        patternDict: dict mapping replacement text to the compiled pattern
            whose matches it should replace.

    Returns:
        The text after all patterns have been substituted in turn.
    """
    for replacement, regex_obj in patternDict.items():
        mystring = regex_obj.sub(replacement, mystring)
    return mystring

Try
import itertools
import re
mystring = 'I have this Pig and that pig with a hOrse and coW'
rd = {'pig': 'car', 'horse':'airplane', 'cow':'bus'}
# Compiled patterns, keyed by the search string they were built from, so each
# pattern is compiled only once across calls.
cachedict = {}


def strrep(orig, repdict):
    """Replace each key of ``repdict`` found in ``orig`` with its value.

    Matching is case-insensitive and literal: keys are escaped, so characters
    such as ``.`` or ``+`` have no regex meaning.

    Args:
        orig: the text to rewrite.
        repdict: dict mapping search string -> replacement string.

    Returns:
        The rewritten text.
    """
    # .items() works on Python 2 and 3 alike.
    for k, v in repdict.items():
        pattern = cachedict.get(k)
        if pattern is None:
            pattern = cachedict[k] = re.compile(re.escape(k), re.IGNORECASE)
        # A function replacement is inserted verbatim, so backslashes in the
        # replacement text are not interpreted as group references.
        orig = pattern.sub(lambda _match, repl=v: repl, orig)
    return orig
print strrep(mystring, rd)
This answer was initially written for python2, but for python 3 you would use repdict.items instead of repdict.iteritems.

Related

add values to a list from specific part of a text file

I am having this text
/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/
I would like to have Alex , Dog , House and red in one list and Maria,Cat,office,green in an other list.
I am having this code
with open(filename) as f :
for i in f:
if i.startswith("/** Goodmorning"):
#add files to list
elif i.startswith("/** Goodnight"):
#add files to other list
So, is there any way to write the script so that it understands that Alex belongs to the part of the text that starts with Goodmorning?
I'd recommend you to use dict, where "section name" will be a key:
# Parse "/** <section>" ... "*/" blocks into {section name: [entries]}.
with open(filename) as f:
    result = {}
    current_list = None
    for line in f:
        # Lines read from a file keep their trailing newline; strip first so
        # the closing "*/" marker is actually recognised.
        line = line.strip()
        if line.startswith("/**"):
            current_list = []
            result[line[3:].strip()] = current_list
        elif line and line != "*/" and current_list is not None:
            # Blank lines and text before the first header are ignored.
            current_list.append(line)
Result:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight': ['Maria', 'Cat', 'Office', 'Green']}
To find which key a given value belongs to, you can use the following code:
search_value = "Alex"
# Scan each section's entries and report the first section containing the value.
for key, values in result.items():
    if search_value in values:
        print(search_value, "belongs to", key)
        break
I would recommend to use Regular expressions. In python there is a module for this called re
import re
s = """/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/"""
# Capture everything between "/**" and "*/" for each block.
pattern = r'/\*\*([\w \n]+)\*/'
word_groups = re.findall(pattern, s)
d = {}
for word_group in word_groups:
    # One entry per non-empty line; this works whether the entries are
    # separated by single or by blank lines.
    words = [w.strip() for w in word_group.splitlines() if w.strip()]
    if words:
        # First line is the section name, the rest are its entries.
        d[words[0]] = words[1:]
print(d)
Output:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight':
['Maria', 'Cat', 'Office', 'Green']}
expanding on Olvin Roght (sorry can't comment - not enough reputation) I would keep a second dictionary for the reverse lookup
# Build both directions at once: section -> entries and entry -> section.
with open(filename) as f:
    key_to_list = {}
    name_to_key = {}
    current_list = None
    current_key = None
    for line in f:
        # Strip first: lines keep their newline, so "*/\n" != "*/".
        line = line.strip()
        if line.startswith("/**"):
            current_list = []
            current_key = line[3:].strip()
            key_to_list[current_key] = current_list
        elif line and line != "*/" and current_list is not None:
            # A name seen in several sections keeps only the last one.
            name_to_key[line] = current_key
            current_list.append(line)
print(key_to_list)
print(name_to_key['Alex'])
alternative is to convert the dictionary afterwards:
name_to_key = {n : k for k in key_to_list for n in key_to_list[k]}
(i.e if you want to go with the regex version from ashwani)
Limitation is that this only permits one membership per name.

python replace content of url between <br/> and <br/>

There's a string like this:
<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>
I want to delete the content:
https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0
Just keep:
<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>
My code:
mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
How to do it?
You can use re.sub from regex module:
import re

mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
# Delete everything from "http" up to, but not including, the next "<".
url_pattern = re.compile(r'http[^<]+')
print(url_pattern.sub('', mystring))
Output:
<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>
You can do this with regex replace:
Find: <br/>https?://[^<]*<br/>
Replace: <br/><br/>
mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
# remove 'https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0'
resultstring = '<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>'
marker = '<br/>'
# Locate the first two markers; the text between them is what gets dropped.
startPos = mystring.find(marker)
endPos = mystring.find(marker, startPos + len(marker)) if startPos != -1 else -1
if endPos != -1:
    firstSubString = mystring[:startPos + len(marker)]  # keep the first marker
    lastSubString = mystring[endPos:]  # resume at the second marker
    completeResult = firstSubString + lastSubString
else:
    # Fewer than two markers: there is nothing between them to remove.
    completeResult = mystring
print(completeResult, completeResult == resultstring)
print(completeResult, resultstring)
import re

mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
# Match "<br/>https" plus the shortest run of any characters up to the next
# "<br/>" (kept by the lookahead), and put back the "<br/>" that was consumed.
result = re.sub(r"(?:<br/>https)([\s\S]*?)(?=<br/>)", '<br/>', mystring)
print(result)
Output:
<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>

Making a list from a loop output

data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
pref_network_find = re.findall('(\S+\s+255.255.255.\w+)',str(data))
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
i = i.replace(word,str(mydict[word]))
pref = print (i)
listi = []
for line in pref_network_find:
listi.append(i)
print (listi)
10.185.16.64 27
55.242.33.0 24
55.242.154.0 30
['55.242.154.0 30', '55.242.154.0 30', '55.242.154.0 30']
Process finished with exit code 0
Im trying to get ['55.242.154.0 30', '55.242.33.0 24', '10.185.16.64 27'] as list1 at the end, but cant understand my mistake here. Could you help me with that?
You do not need to garner the initial spliced and joined IPs with regex; instead, just use str.split():
import re

data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
# Each entry is "<keyword> <address> <mask>"; emit "<address> <prefix length>".
final_list = sorted(
    ['{} {}'.format(b, mydict[c]) for a, b, c in [i.split() for i in data]],
    # Sort numerically by octets, then prefix. The key must be a list: on
    # Python 3 a bare map() object cannot be compared.
    key=lambda x: [int(part) for part in re.split(r'\.|\s', x)],
    reverse=True
)
Output:
['55.242.154.0 30', '55.242.33.0 24', '10.185.16.64 27']
Obviously, it will print the last entry (the one ending in 30) three times, because of this code:
for i in pref_network_find:
splitlines = i.split()
for word in splitlines:
if word in mydict:
i = i.replace(word,str(mydict[word]))
pref = print (i)
After this loop finishes, i still holds the last converted entry, '55.242.154.0 30'. And you are then reusing that stale variable i like this:
for line in pref_network_find:
listi.append(i)
So yes, the code is doing exactly what it was told: i is '55.242.154.0 30', and that same value is appended to your result once per iteration.
Correct code goes like this.
import re

data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
# "<address> <mask>" pairs; the dots are escaped so they only match dots.
pref_network_find = re.findall(r'(\S+\s+255\.255\.255\.\w+)', str(data))
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
listi = []
for i in pref_network_find:
    splitlines = i.split()
    for word in splitlines:
        if word in mydict:
            # Swap the dotted mask for its prefix length.
            i = i.replace(word, str(mydict[word]))
    print(i)
    # Append inside the loop, while i still refers to the current entry.
    listi.append(i)
print(listi)
Correct me if I am wrong here, maybe you want something else, however, this is what I understood by your question.
Your code is wrong because you are appending with wrong index i at here :
for line in pref_network_find:
listi.append(i)
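After the previous loop, i still holds its last value, '55.242.154.0 30', so that same string is appended three times. Append the converted value directly inside the first loop instead (simply using line instead of i would append the unconverted entries):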
import re

data = ['network 10.185.16.64 255.255.255.224','network 55.242.33.0 255.255.255.0','network 55.242.154.0 255.255.255.252']
# "<address> <mask>" pairs; the dots are escaped so they only match dots.
pref_network_find = re.findall(r'(\S+\s+255\.255\.255\.\w+)', str(data))
mydict = {"255.255.255.0":24,"255.255.255.128":25,"255.255.255.192":26,"255.255.255.224":27,"255.255.255.240":28,"255.255.255.248":29,"255.255.255.252":30}
listi = []
for i in pref_network_find:
    splitlines = i.split()
    for word in splitlines:
        if word in mydict:
            # Append the converted entry right where it is produced.
            listi.append(i.replace(word, str(mydict[word])))
print(listi)

Join whole word by its Tag Python

let say i have this sentences:
His/O name/O is/O Petter/Name Jack/Name and/O his/O brother/O name/O is/O
Jonas/Name Van/Name Dame/Name
How can i get result like this:
Petter Jack, Jonas Van Dame.
So far i've already tried this, but still its just join 2 word :
import re
pattern = re.compile(r"\w+\/Name)
sent = sentence.split()
for i , w in sent:
if pattern.match(sent[i]) != None:
if pattern.match(sent[i+1]) != None:
#....
#join sent[i] and sent[i+1] element
#....
Try something like this
# One match per run of consecutive "<word>/Name" tokens. The group is
# non-capturing so findall() returns the whole run as a string.
pattern = re.compile(r"(?:\w+/Name\s*)+")
names = pattern.findall(your_string)
for name in names:
    # Drop the tags and normalise whitespace; the run may end with the
    # separator that followed its last token.
    print(' '.join(name.replace('/Name', '').split()))
I'm thinking about a two-phase solution
# Phase 1: find each run of space-separated "<word>/Name" tokens.
name_run = re.compile(r'\w+/Name(?: \w+/Name)*')
result = name_run.findall(s)
# -> ['Petter/Name Jack/Name', 'Jonas/Name Van/Name Dame/Name']
# Phase 2: strip the tags. The loop variable has its own name so it does not
# overwrite the compiled pattern.
for run in result:
    print(run.replace('/Name', ''))
# -> Petter Jack
# -> Jonas Van Dame

How to sum up the word count for each person in a dialogue?

I'm starting to learn Python and I'm trying to write a program that would import a text file, count the total number of words, count the number of words in a specific paragraph (said by each participant, described by 'P1', 'P2' etc.), exclude these words (i.e. 'P1' etc.) from my word count, and print paragraphs separately.
Thanks to @James Hurford I got this code:
# Read the whole file as a flat list of whitespace-separated words.
with open('data.txt') as f:
    words = f.read().split()
total_words = len(words)
print('Total words:', total_words)
in_para = False
para_type = None
paragraph = []
for word in words:
    if 'P1' in word or 'P2' in word or 'P3' in word:
        # A speaker tag closes the paragraph collected so far (if any) and
        # opens the next one.
        if in_para:
            print('Words in paragraph', para_type, ':', len(paragraph))
            print(' '.join(paragraph))
            del paragraph[:]
        else:
            in_para = True
        para_type = word
    else:
        paragraph.append(word)
# The last paragraph has no following tag to flush it, so report it here.
if in_para:
    print('Words in last paragraph', para_type, ':', len(paragraph))
    print(' '.join(paragraph))
else:
    print('No words')
My text file looks like this:
P1: Bla bla bla.
P2: Bla bla bla bla.
P1: Bla bla.
P3: Bla.
The next part I need to do is summing up the words for each participant. I can only print them, but I don't know how to return/reuse them.
I would need a new variable with word count for each participant that I could manipulate later on, in addition to summing up all the words said by each participant, e.g.
P1all = sum of words in paragraph
Is there a way to count "you're" or "it's" etc. as two words?
Any ideas how to solve it?
I would need a new variable with word count for each participant that I could manipulate later on
No, you would need a Counter (Python 2.7+, else use a defaultdict(int)) mapping persons to word counts.
from collections import Counter
#from collections import defaultdict
words_per_person = Counter()
#words_per_person = defaultdict(int)
for ln in inputfile:
    # Split on the first colon only; the text itself may contain colons.
    person, sep, text = ln.partition(':')
    if not sep:
        # Blank line or line without a speaker tag: nothing to attribute.
        continue
    words_per_person[person.strip()] += len(text.split())
Now words_per_person['P1'] contains the number of words of P1, assuming text.split() is a good enough tokenizer for your purposes. (Linguists disagree about the definition of word, so you're always going to get an approximation.)
Congrats on beginning your adventure with Python! Not everything in this post might make sense right now but bookmark it and comeback to it if it seems helpful later. Eventually you should try to move from scripting to software engineering, and here are a few ideas for you!
With great power comes great responsibility, and as a Python developer you need to be more disciplined than other languages which don't hold your hand and enforce "good" design.
I find it helps to start with a top-down design.
def main():
    """Top-level outline: load the text, process it, build the catalogue.

    Returns:
        Whatever ``process_catalogue`` produces for the processed text.
    """
    text = get_text()
    p_text = process_text(text)
    catalogue = process_catalogue(p_text)
    # Hand the result back instead of discarding it.
    return catalogue
BOOM! You just wrote the whole program -- now you just need to go back and fill in the blanks! When you do it like this, it seems less intimidating. Personally, I don't consider myself smart enough to solve very big problems, but I'm a pro at solving small problems. So let's tackle one thing at a time. I'm going to start with 'process_text'.
def process_text(text):
    """Outline of the text stage: bundle, filter, then clean dialogue items.

    Args:
        text: the input passed on to ``bundle_dialogue_items``.

    Returns:
        The result of the final cleaning step.
    """
    b_text = bundle_dialogue_items(text)
    f_text = filter_dialogue_items(b_text)
    c_text = clean_dialogue_items(f_text)
    # Without an explicit return the caller would receive None.
    return c_text
I'm not really sure what those things mean yet, but I know that text problems tend to follow a pattern called "map/reduce", which means you perform an operation on something and then you clean it up and combine, so I put in some placeholder functions. I might go back and add more if necessary.
Now let's write 'process_catalogue'. I could've written "process_dict" but that sounded lame to me.
def process_catalogue(p_text):
    """Outline of the catalogue stage: group, count per paragraph, then total.

    Args:
        p_text: the processed text produced by ``process_text``.

    Returns:
        The catalogue after the totals have been added.
    """
    # Use the parameter itself; the name c_text is not defined in this scope.
    speakers = make_catalogue(p_text)
    s_speakers = sum_words_per_paragraph_items(speakers)
    t_speakers = total_word_count(s_speakers)
    return t_speakers
Cool. Not too bad. You might approach this differently than me, but I thought it would make sense to aggregate the items, then count the words per paragraph, and then count all the words.
So, at this point I'd probably make one or two little 'lib' (library) modules to back-fill the remaining functions. For the sake of you being able to run this without worrying about imports, I'm going to stick it all in one .py file, but eventually you'll learn how to break these up so it looks nicer. So let's do this.
# ------------------ #
# == process_text == #
# ------------------ #
def bundle_dialogue_items(lines):
    """Tag each line with its speaker and that speaker's paragraph number.

    A line matching the module-level pattern ``p`` opens a new paragraph for
    the speaker named before its first colon; any other line continues the
    current speaker's paragraph.

    Args:
        lines: iterable of text lines.

    Yields:
        ``(speaker, dialogue, paragraph_number)`` tuples. Lines seen before
        any speaker tag are yielded with speaker ``None`` and number 0.
    """
    cur_speaker = None
    paragraphs = Counter()
    for line in lines:
        if re.match(p, line):
            # Split on the first colon only, so dialogue that itself
            # contains ':' does not break the two-way unpacking.
            cur_speaker, dialogue = line.split(':', 1)
            paragraphs[cur_speaker] += 1
        else:
            dialogue = line
        yield cur_speaker, dialogue, paragraphs[cur_speaker]
def filter_dialogue_items(lines):
    """Yield only the ``(name, dialogue, paragraph)`` items with dialogue.

    Items whose dialogue is empty (or otherwise falsy) are dropped.
    """
    for name, dialogue, paragraph in lines:
        if dialogue:
            yield name, dialogue, paragraph
def clean_dialogue_items(flines):
    """Split each item's dialogue into words and clean every word.

    Yields:
        ``(name, words, paragraph)`` where ``words`` is the list of
        whitespace-separated tokens, each passed through ``clean_word``.
    """
    for name, dialogue, paragraph in flines:
        # split() with no argument already ignores leading/trailing blanks.
        cleaned_words = [clean_word(w) for w in dialogue.split()]
        yield name, cleaned_words, paragraph
aaaand a little helper function
# ------------------- #
# == aux functions == #
# ------------------- #
# Every character that is stripped out of a word.
to_clean = string.whitespace + string.punctuation


def clean_word(word):
    """Return ``word`` with all whitespace and ASCII punctuation removed."""
    return ''.join(c for c in word if c not in to_clean)
So it may not be obvious, but this library is designed as a data processing pipeline. There are several ways to process data: one is pipeline processing and another is batch processing. Let's take a look at batch processing.
# ----------------------- #
# == process_catalogue == #
# ----------------------- #
# Key under which each speaker's per-paragraph word lists are stored.
speaker_stats = 'stats'


def make_catalogue(names_with_dialogue):
    """Group words by speaker and paragraph number.

    Args:
        names_with_dialogue: iterable of ``(name, words, paragraph)`` items.

    Returns:
        ``{name: {'stats': {paragraph: [word, ...]}}}``; items sharing a
        speaker and paragraph have their words concatenated in order.
    """
    speakers = {}
    for name, dialogue, paragraph in names_with_dialogue:
        stats = speakers.setdefault(name, {}).setdefault(speaker_stats, {})
        stats.setdefault(paragraph, []).extend(dialogue)
    return speakers
# Key under which each speaker's per-paragraph word counts are stored.
word_count = 'word_count'


def sum_words_per_paragraph_items(speakers):
    """Add a per-paragraph word count to every speaker entry.

    Mutates ``speakers`` in place: each entry gains a ``Counter`` mapping
    paragraph number to the number of words in that paragraph.

    Returns:
        The same ``speakers`` dict, for chaining.
    """
    for entry in speakers.values():
        counts = Counter()
        for paragraph, words in entry[speaker_stats].items():
            counts[paragraph] += len(words)
        entry[word_count] = counts
    return speakers
# Key under which each speaker's overall word total is stored.
total = 'total'


def total_word_count(speakers):
    """Add each speaker's total word count across all paragraphs.

    Mutates ``speakers`` in place and returns it for chaining.
    """
    for entry in speakers.values():
        entry[total] = sum(entry[word_count].values())
    return speakers
All these nested dictionaries are getting a little complicated. In actual production code I would replace these with some more readable classes (along with adding tests and docstrings!!), but I don't want to make this more confusing than it already is! Alright, for your convenience below is the whole thing put together.
import pprint
import re
import string
from collections import Counter
# A speaker tag: one or more word characters followed by a colon, at the
# start of the line.
p = re.compile(r'(\w+?):')


def get_text_line_items(text):
    """Yield the lines of ``text``, split on newline characters."""
    for line in text.split('\n'):
        yield line


def bundle_dialogue_items(lines):
    """Tag each line with its speaker and that speaker's paragraph number.

    A line that starts with a speaker tag opens a new paragraph for that
    speaker; any other line continues the current speaker's paragraph.

    Args:
        lines: iterable of text lines.

    Yields:
        ``(speaker, dialogue, paragraph_number)`` tuples. Lines seen before
        any speaker tag are yielded with speaker ``None`` and number 0.
    """
    cur_speaker = None
    paragraphs = Counter()
    for line in lines:
        if re.match(p, line):
            # Split on the first colon only, so dialogue that itself
            # contains ':' does not break the two-way unpacking.
            cur_speaker, dialogue = line.split(':', 1)
            paragraphs[cur_speaker] += 1
        else:
            dialogue = line
        yield cur_speaker, dialogue, paragraphs[cur_speaker]
def filter_dialogue_items(lines):
    """Yield only the ``(name, dialogue, paragraph)`` items with dialogue.

    Items whose dialogue is empty (or otherwise falsy) are dropped.
    """
    for name, dialogue, paragraph in lines:
        if dialogue:
            yield name, dialogue, paragraph
# Every character that is stripped out of a word.
to_clean = string.whitespace + string.punctuation


def clean_word(word):
    """Return ``word`` with all whitespace and ASCII punctuation removed."""
    return ''.join(c for c in word if c not in to_clean)
def clean_dialogue_items(flines):
    """Split each item's dialogue into words and clean every word.

    Yields:
        ``(name, words, paragraph)`` where ``words`` is the list of
        whitespace-separated tokens, each passed through ``clean_word``.
    """
    for name, dialogue, paragraph in flines:
        # split() with no argument already ignores leading/trailing blanks.
        cleaned_words = [clean_word(w) for w in dialogue.split()]
        yield name, cleaned_words, paragraph
# Key under which each speaker's per-paragraph word lists are stored.
speaker_stats = 'stats'


def make_catalogue(names_with_dialogue):
    """Group words by speaker and paragraph number.

    Args:
        names_with_dialogue: iterable of ``(name, words, paragraph)`` items.

    Returns:
        ``{name: {'stats': {paragraph: [word, ...]}}}``; items sharing a
        speaker and paragraph have their words concatenated in order.
    """
    speakers = {}
    for name, dialogue, paragraph in names_with_dialogue:
        stats = speakers.setdefault(name, {}).setdefault(speaker_stats, {})
        stats.setdefault(paragraph, []).extend(dialogue)
    return speakers
def clean_dict(speakers):
    """Clean every stored word of every speaker, in place.

    Each paragraph's word list is replaced by a new list in which every
    character listed in ``to_clean`` has been removed from each word.

    Returns:
        The same ``speakers`` dict, for chaining.
    """
    for entry in speakers.values():
        stats = entry[speaker_stats]
        for paragraph in stats:
            # Assigning to an existing key while iterating is safe: the set
            # of keys does not change.
            stats[paragraph] = [
                ''.join(c for c in word if c not in to_clean)
                for word in stats[paragraph]
            ]
    return speakers
# Key under which each speaker's per-paragraph word counts are stored.
word_count = 'word_count'


def sum_words_per_paragraph_items(speakers):
    """Add a per-paragraph word count to every speaker entry.

    Mutates ``speakers`` in place: each entry gains a ``Counter`` mapping
    paragraph number to the number of words in that paragraph.

    Returns:
        The same ``speakers`` dict, for chaining.
    """
    for entry in speakers.values():
        counts = Counter()
        for paragraph, words in entry[speaker_stats].items():
            counts[paragraph] += len(words)
        entry[word_count] = counts
    return speakers
# Key under which each speaker's overall word total is stored.
total = 'total'


def total_word_count(speakers):
    """Add each speaker's total word count across all paragraphs.

    Mutates ``speakers`` in place and returns it for chaining.
    """
    for entry in speakers.values():
        entry[total] = sum(entry[word_count].values())
    return speakers
def get_text():
    """Return the built-in sample dialogue as an iterator over its lines."""
    text = '''BOB: blah blah blah blah
blah hello goodbye etc.
JERRY:.............................................
...............
BOB:blah blah blah
blah blah blah
blah.
BOB: boopy doopy doop
P1: Bla bla bla.
P2: Bla bla bla bla.
P1: Bla bla.
P3: Bla.'''
    return get_text_line_items(text)
def process_catalogue(c_text):
    """Build the speaker catalogue and add per-paragraph and total counts.

    Args:
        c_text: iterable of cleaned ``(name, words, paragraph)`` items.

    Returns:
        The catalogue dict returned by ``total_word_count``.
    """
    speakers = make_catalogue(c_text)
    s_speakers = sum_words_per_paragraph_items(speakers)
    return total_word_count(s_speakers)
def process_text(text):
    """Chain the text stages: bundle lines, drop empty ones, clean words.

    Args:
        text: iterable of raw text lines.

    Returns:
        The iterator produced by ``clean_dialogue_items``.
    """
    b_text = bundle_dialogue_items(text)
    f_text = filter_dialogue_items(b_text)
    return clean_dialogue_items(f_text)
def main():
    """Run the sample text through the pipeline and pretty-print the result."""
    text = get_text()
    c_text = process_text(text)
    t_speakers = process_catalogue(c_text)
    # take a look at your hard work!
    pprint.pprint(t_speakers)


if __name__ == '__main__':
    main()
So this script is almost certainly overkill for this application, but the point is to see what (questionably) readable, maintainable, modular Python code might look like.
Pretty sure output looks something like:
{'BOB': {'stats': {1: ['blah',
'blah',
'blah',
'blah',
'blah',
'hello',
'goodbye',
'etc'],
2: ['blah',
'blah',
'blah',
'blah',
'blah',
'blah',
'blah'],
3: ['boopy', 'doopy', 'doop']},
'total': 18,
'word_count': Counter({1: 8, 2: 7, 3: 3})},
'JERRY': {'stats': {1: ['', '']}, 'total': 2, 'word_count': Counter({1: 2})},
'P1': {'stats': {1: ['Bla', 'bla', 'bla'], 2: ['Bla', 'bla']},
'total': 5,
'word_count': Counter({1: 3, 2: 2})},
'P2': {'stats': {1: ['Bla', 'bla', 'bla', 'bla']},
'total': 4,
'word_count': Counter({1: 4})},
'P3': {'stats': {1: ['Bla']}, 'total': 1, 'word_count': Counter({1: 1})}}
You can do this with two variables. One to keep track of what person is speaking, the other to keep the paragraphs for the persons speaking. For storing the paragraphs and associating who it is that the paragraph belongs to use a dict with the person as the key and a list of paragraphs that person said associated with this key.
# person tag -> list of paragraphs, each paragraph being a list of words
para_dict = dict()
para_type = None
for word in words:
    # Test the prefix, so that the [:2] slice below really is the tag.
    if word.startswith(('P1', 'P2', 'P3')):
        #extract the part we want leaving off the ':'
        para_type = word[:2]
        #create a dict with a list of lists
        #to contain each paragraph the person uses
        para_dict.setdefault(para_type, list()).append(list())
    elif para_type is not None:
        #Append the word to the last list in the list of lists
        #(words before the first speaker tag belong to nobody and are skipped)
        para_dict[para_type][-1].append(word)
From here you can sum up the number of words spoken thus
# Report, for each person, the total number of words over all paragraphs.
for person, para_list in para_dict.items():
    counts_list = [len(para) for para in para_list]
    print(person, 'spoke', sum(counts_list), 'words')

Categories

Resources