Find and replace in string re.insensitive - python

I have this code to find matches in a string. I'm using it in a search feature to mark the words that match my search term. I need the match to be case-insensitive; the issue is that it replaces each matched word with the search term itself instead of keeping the word's original casing.
val_to_searchp = "this Text string has alot of teXt"
word = "TEXT"
pal2rep = str(":::")+word+str(":::")
val_to_search = re.sub(re.escape(word), pal2rep, val_to_searchp, flags=re.IGNORECASE)
this will return
"this :::TEXT::: string has alot of :::TEXT:::"
I need it to return
"this :::Text::: string has alot of :::teXt:::"
I also tried this, but it's not working very well :(
f = 0
s = 0
val_to_search = val_to_searchp
for m in re.finditer(str(word), str(val_to_searchp)):
inicio = int(m.start()+s)
fim = int(m.end()+f)
val_to_search = val_to_search[:inicio] \
+ str(":::") \
+ val_to_search[inicio:fim] \
+ str(":::") \
+ val_to_search[fim:].strip()
f = f+2
s = s+1
This is my actual code:
def findtext():
if len(str(findtext_inp.get('1.0', END)))>1:
val_to_searchp = str(respon_txt.get(1.0, END).replace(html.unescape('⛔'), "").strip())
respon_txt.delete(1.0, END)
word = str(findtext_inp.get('1.0', END).strip())
pal2rep = str(str(html.unescape('⛔'))+word+str(html.unescape('⛔')))
val_to_search = re.sub(re.escape(word), pal2rep, val_to_searchp, flags=re.IGNORECASE)
"""
f = 0
s = 0
for m in re.finditer(str(word), str(val_to_search)):
inicio = int(m.start()+s)
fim = int(m.end()+f)
val_to_search = val_to_search[:inicio] \
+ str(html.unescape('⛔')) \
+ val_to_search[inicio:fim] \
+ str(html.unescape('⛔')) \
+ val_to_search[fim:].strip()
f = f+2
s = s+1
"""
respon_txt.insert(1.0, val_to_search)#val_to_search.replace(findtext_inp.get('1.0', END).strip() , str(html.unescape('⛔')+findtext_inp.get('1.0', END).strip())+html.unescape('⛔')))

I'm sure there's a way to do this with RE but it's really trivial without the aid of that module.
val_to_searchp = "this Text string has alot of teXt\nThis also has a lot of text"
text = 'TEXT'
def func(s, txt):
    """Wrap each word of *s* that equals *txt* (ignoring case) in ``:::``.

    A "word" is a whitespace-delimited token, so ``"text,"`` does not match
    ``"text"``. Lines are processed independently (split on ``'\\n'``); within
    a line, runs of whitespace are normalised to a single space because the
    words are re-joined with ``' '``.

    Returns the marked-up string; the original casing of every word is kept.
    """
    target = txt.lower()
    marked_lines = []
    for line in s.split('\n'):
        words = [
            f':::{word}:::' if word.lower() == target else word
            for word in line.split()
        ]
        marked_lines.append(' '.join(words))
    return '\n'.join(marked_lines)
print(func(val_to_searchp, text))
Output:
this :::Text::: string has alot of :::teXt:::
This also has a lot of :::text:::

This is a rewrite of my original answer. In the comments for that answer you will see that the OP has changed his mind about how this needs to work. This now (hopefully) complies with the altered specification:
val_to_searchp = '''{\"configurationKey\":[{\"key\":\"GetMaxKeys\",\"readonly\":true,\"value\":\"20\"}'''
text = 'GetMaxKeys'
def func(s, txt):
    """Wrap every case-insensitive occurrence of *txt* in *s* with ``:::``.

    Matching is on substrings (not whole words) and is non-overlapping,
    scanning left to right. The matched text keeps its original casing.

    An empty *txt* matches nothing and *s* is returned unchanged; without
    this guard the scan would never advance and would loop forever.
    """
    if not txt:
        return s
    # NOTE: offsets found in the lowered copy are applied to the original
    # string; this assumes lower() keeps the length (true for ASCII, not for
    # a few characters such as U+0130).
    haystack = s.lower()
    needle = txt.lower()
    width = len(needle)
    pieces = []
    offset = 0
    # find() with a start index avoids re-slicing the haystack on every match.
    while (hit := haystack.find(needle, offset)) >= 0:
        pieces.append(s[offset:hit])
        pieces.append(f':::{s[hit:hit + width]}:::')
        offset = hit + width
    pieces.append(s[offset:])
    return ''.join(pieces)
print(func(val_to_searchp, text))
Output:
{"configurationKey":[{"key":":::GetMaxKeys:::","readonly":true,"value":"20"}

Related

Extract words between the word "for" and the opening parenthesis "(" in an email subject line. Email subject line is the input

The client's name is after the word "for" and before the opening parenthesis "(" that starts the proposal number. I need to extract the client name to use to look up the deal in a future step. What would be the easiest way to set this up? Using Zapier Extract Pattern or to Use Zapier Code in Python?
I have tried this and it did not work. It seemed promising though.
input_data
client = Reminder: Leruths has sent you a proposal for Business Name (#642931)
import regex
rgx = regex.compile(r'(?si)(?|{0}(.*?){1}|{1}(.*?)
{0})'.format('for', '('))
s1 = 'client'
for s in [s1]:
m = rgx.findall
for x in m:
print x.strip()
I have also tried this and it did not work.
start = mystring.find( 'for' )
end = mystring.find( '(' )
if start != -1 and end != -1:
result = mystring[start+1:end]
I am looking for Business Name to be returned in my example.
Fastest way:
start = client.find('for')
end = client.find('(')
result = client[start+4:end-1]
print(result)
With regex:
result = re.search(r' for (.*) [(]', client)
print(result.group(1))
There is probably a cleaner way to do this, but here is another solution without regex
client = "Reminder: Leruths has sent you a proposal for Business Name (#642931)"
# Collect the words that follow the standalone word "for" and stop collecting
# at the first word that opens with "(" (the proposal number).
name_words = []
collecting = False
for word in client.split(" "):
    if word == "for":
        collecting = True
    elif word.startswith("("):
        collecting = False
    elif collecting:
        name_words.append(word)
name = " ".join(name_words).strip()
print(name)
Another method:
client = "Reminder: Leruths has sent you a proposal for Business Name (#642931)"
words = client.split(" ")
# Walk the words after the first standalone "for" and stop at the first word
# that opens with "(". list.index raises ValueError if "for" is absent.
name_words = []
for word in words[words.index("for") + 1:]:
    if word.startswith("("):
        break
    name_words.append(word)
name = " ".join(name_words).strip()
print(name)
Running the code below gives:
Regex method took 2.3912417888641357 seconds
Search word by word method took 4.78193998336792 seconds
Search with list index method took 3.1756017208099365 seconds
String indexing method took 0.8496286869049072 seconds
Code to check the fastest to get the name over a million tries:
import re
import time
client = "Reminder: Leruths has sent you a proposal for Business Name (#642931)"
def withRegex(client):
    """Return the text between ``" for "`` and ``" ("`` in *client* using a regex.

    The ``(.*)`` group is greedy, so the match runs up to the last ``" ("``
    in the string.

    Raises:
        AttributeError: if the pattern does not match (``re.search`` returns
            ``None``).
    """
    match = re.search(r' for (.*) [(]', client)
    return match.group(1)
def searchWordbyWord(client):
    """Return the words between the word ``"for"`` and a word starting with ``"("``.

    *client* is split on single spaces. Collection is switched on by each
    standalone ``"for"`` and off by each word that starts with ``"("``; the
    marker words themselves are never collected. Returns an empty string when
    nothing is collected.
    """
    name_words = []
    collecting = False
    for word in client.split(" "):
        if word == "for":
            collecting = True
        elif word.startswith("("):
            collecting = False
        elif collecting:
            name_words.append(word)
    # Join once instead of growing a string inside the loop.
    return " ".join(name_words).strip()
def searchWithListIndex(client):
    """Return the words after the first ``"for"`` up to a word starting with ``"("``.

    *client* is split on single spaces; scanning starts right after the first
    standalone ``"for"`` and stops at the first word that starts with ``"("``
    (or at the end of the string).

    Raises:
        ValueError: if *client* contains no standalone word ``"for"``.
    """
    words = client.split(" ")
    name_words = []
    for word in words[words.index("for") + 1:]:
        if word.startswith("("):
            break
        name_words.append(word)
    return " ".join(name_words).strip()
def stringIndexing(client):
    """Return the text between the first ``"for"`` and the first ``"("`` by slicing.

    The slice skips the four characters of ``"for "`` and drops the single
    space expected before ``"("``, so it assumes exactly that spacing.

    Returns an empty string when either marker is missing; slicing with the
    ``-1`` that ``str.find`` returns would otherwise yield an unrelated piece
    of the input.
    """
    start = client.find('for')
    end = client.find('(')
    if start == -1 or end == -1:
        return ''
    return client[start + 4:end - 1]
def _time_method(label, method, tries=1000000):
    """Call ``method(client)`` *tries* times, print and return the elapsed seconds.

    Uses ``time.perf_counter`` (a monotonic, high-resolution clock) rather
    than wall-clock ``time.time``. ``range(tries)`` runs exactly *tries*
    calls; the earlier ``range(1, 1000000)`` ran one call short of a million.
    """
    started = time.perf_counter()
    for _ in range(tries):
        method(client)
    elapsed = time.perf_counter() - started
    print(label + " took " + str(elapsed) + " seconds")
    return elapsed


wr = _time_method("Regex method", withRegex)
sw = _time_method("Search word by word method", searchWordbyWord)
wl = _time_method("Search with list index method", searchWithListIndex)
si = _time_method("String indexing method", stringIndexing)

Python : How to translate?

The program should decode user input such as "8#15#23###23#1#19###9#20";
the output should be "HOW WAS IT".
However, it fails to decode the space (###).
enter code here
ABSTRACT ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
ABSTRACT_SHIFTED = {value:key for key,value in ABSTRACT.items()}
def from_abstract(s):
result = ''
for word in s.split('*'):
result = result +ABSTRACT_SHIFTED.get(word)
return result
This would do the trick:
#!/usr/bin/env python
InputString = "8#15#23###23#1#19###9#20"
# "###" encodes a space. Shrinking it to "##" before splitting on "#" leaves
# exactly one empty token per space between the letter codes.
tokens = InputString.replace("###", "##").split("#")
# Letter codes are 1-based alphabet positions: 1 -> "A" (chr(65)), 26 -> "Z".
DecodedMessage = "".join(
    " " if token == "" else chr(int(token) + 64)
    for token in tokens
)
print(DecodedMessage)
Prints:
HOW WAS IT
you can also use a regex
import re
replacer ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
# Inverted mapping: code -> character. Named `decoder` so the builtin
# `reversed` is not shadowed.
decoder = {value: key for key, value in replacer.items()}
target = '8#15#23###23#1#19###9#20'
# Regex alternation takes the first alternative that matches, so longer codes
# must come first ("15" before "1", "###" before "#"). Codes are escaped and
# matched literally, so every match is guaranteed to be a key of `decoder`.
pattern = '|'.join(
    re.escape(code) for code in sorted(decoder, key=len, reverse=True)
)
decoded = re.sub(pattern, lambda match: decoder[match.group(0)], target)
print(decoded)
And prints:
HOW WAS IT
With a couple minimal changes to your code it works.
1) split on '#', not '*'
2) retrieve ' ' by default if a match isn't found
3) use '##' instead of '###'
def from_abstract(s):
    """Decode a '#'-separated string of letter codes using ABSTRACT_SHIFTED.

    "###" (the space marker) is first shortened to "##", so splitting on "#"
    yields one empty token per space. Any token that is not a key of
    ABSTRACT_SHIFTED, including that empty token, decodes to a single space.
    """
    tokens = s.replace('###', '##').split('#')
    return ''.join(ABSTRACT_SHIFTED.get(token, " ") for token in tokens)
Swap the key-value pairs of ABSTRACT and use simple split + join on input
ip = "8#15#23###23#1#19###9#20"
# Invert the mapping in place: code -> character.
ABSTRACT = {v: k for k, v in ABSTRACT.items()}
# Splitting "###" on "#" yields two empty tokens, each decoded to a space by
# the default, so the resulting double space is collapsed into one.
''.join(ABSTRACT.get(i,' ') for i in ip.split('#')).replace('  ', ' ')
#'HOW WAS IT'
The biggest challenge here is that "#" is used as a token separator and as the space character, you have to know the context to tell which you've got at any given time, and that makes it difficult to simply split the string. So write a simple parser. This one will accept anything as the first character in a token and then grab everything until it sees the next "#".
ABSTRACT ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
ABSTRACT_SHIFTED = {value:key for key,value in ABSTRACT.items()}
user_input = "8#15#23###23#1#19###9#20"
def from_abstract(s):
    """Decode a string of '#'-separated letter codes; "###" stands for a space.

    The input is consumed left to right. A leading "###" becomes one space.
    Otherwise everything up to the next "#" (or the end of the input) is one
    token, looked up in ABSTRACT_SHIFTED; unknown tokens decode to a space.
    """
    result = []
    while s:
        if s.startswith("###"):
            result.append(" ")
            s = s[3:]
            continue
        # tokens are terminated with #
        idx = s.find("#")
        # ...except at end of line, where the token is the whole remainder
        if idx == -1:
            idx = len(s)
        token = s[:idx]
        # Drop the single "#" separator, unless it is the first character of
        # a "###" space marker, which the next iteration must see intact.
        s = s[idx:] if s.startswith("###", idx) else s[idx + 1:]
        result.append(ABSTRACT_SHIFTED.get(token, ' '))
    return ''.join(result)
print(from_abstract(user_input))

Python find function selects one match per line

I am trying to make a simple text editor using python. I am now trying to make a find function. This is what I've got:
def Find():
text = textArea.get('1.0', END+'-1c').lower()
input = simpledialog.askstring("Find", "Enter text to find...").lower()
startindex = []
endindex = []
lines = 0
if input in text:
text = textArea.get('1.0', END+'-1c').lower().splitlines()
for var in text:
character = text[lines].index(input)
start = str(lines + 1) + '.' + str(character)
startindex.append(start)
end = str(lines + 1) + '.' + str(character + int(len(input)))
endindex.append(end)
textArea.tag_add('select', startindex[lines], endindex[lines])
lines += 1
textArea.tag_config('select', background = 'green')
This will successfully highlight words that match the user's input with a green background. But the problem is that it only highlights the first match on every line, as you can see here.
I want it to highlight all matches.
Full code here: https://pastebin.com/BkuXN5pk
Recommend using the text widget's built-in search capability. Shown using python3.
from tkinter import *
root = Tk()
textArea = Text(root)
textArea.grid()
textArea.tag_config('select', background = 'green')
f = open('mouse.py', 'r')
content = f.read()
f.close()
textArea.insert(END, content)
def Find(input):
    """Tag every occurrence of *input* in ``textArea`` with the 'select' tag.

    Searches forward from the start of the widget with its built-in
    ``search`` method and resumes one character after each hit, so
    overlapping occurrences are tagged too. An empty *input* has nothing to
    highlight and returns immediately.
    """
    if not input:
        return
    length = len(input)
    start = '1.0'  # text indices are "line.column" strings
    while True:
        pos = textArea.search(input, start, END)
        if not pos:
            break
        # "<index>+<n>c" means n characters after <index>: the end of the match.
        textArea.tag_add('select', pos, pos + '+' + str(length) + 'c')
        start = pos + '+1c'
Find('display')
root.mainloop()

Trouble with recursive text splitting

I'm trying to split text at boundary markers that are defined in the text itself, using recursion, to create a list of lists and strings containing all of the organized parts of the original text file.
The split isn't happening.
Here is the short version: The real problem script:
def separate(text,boundary = None):
if boundary == None:
m = re.findall(r'(?<=boundary=).*',text)
i = 0
while i < len(m): #have all levels of Boundary/headers named
boundary = m[i]
textList = recursiveSplit(text,boundary)
i += 1
pdb.set_trace()
return textList
def recursiveSplit(chunk,boundary):
if type(chunk) is types.StringType:
ar = re.split(r'(?P<boundary>)(?!--)',chunk)
return ar
if type(chunk) is types.ListType:
i = 0
while i < len(chunk):
chunk[i] = recursiveSplit(chunk[i],boundary)
i += 1
return obj
I've posted this script before and people wanted me to post it in its entirety so I'll do that
#Textbasics email parser
#based on a "show original" file converted into text
from sys import argv
import re, os, pdb, types
script, filename = argv
text = open(filename).read()
type = "text only" #Set the default type of email
#cut the email up by sections
#--A section is defined as any time there are two line breaks in a row
textList = re.split(r"\n\n", text)
header = textList[0]
if re.search(r'MIME-Version',header):
type = "MIME"
# If mail has no attachments, parse as a text-only email
class Parser(object):
def __init__(self,textList):
a = 1
self.body = ""
self.textList = textList
self.header = textList[0]
while a < len(textList):
self.body = self.body + textList[a] + '\n\n'
a += 1
m = re.search(r'(?<=Subject: ).*', self.header)
self.subject = m.group(0)
m = re.search(r'(?<=From: ).*', self.header)
self.fromVar = m.group(0)
m = re.search(r'(?<=To: ).*', self.header)
self.toVar = m.group(0)
m = re.search(r'(?<=Date: )\w+\s\w+\s\w+', self.header)
self.date = m.group(0)
def returnParsed(self,descriptor = "all"):
if descriptor == "all":
retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body
return retv
if descriptor == "subject":
return self.subject
if descriptor == "fromVar":
return self.fromVar
if descriptor == "toVar":
return self.toVar
if descriptor == "date":
return self.date
if descriptor == "body":
return self.body
class MIMEParser(Parser):
class MIMEDataDecoder(object):
def __init__(self,decodeString,type):
pass
def __init__(self,textList):
self.textList = textList
self.nestedItems = []
newItem = NestedItem(self)
newItem.setContentType("Header")
newItem.setValue(self.textList[0])
self.nestedItems.append(newItem)
if re.search(r'(boundary=)',newItem.value):
helperItem = NestedItem(self)
helperItem.value = (self.textList[0])
m = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
helperItem.setContentType(m.group(0))
self.nestedItems.append(helperItem)
self.organizeData()
"""i = 0
while i < len(self.textList):
newItem = NestedItem(self)
ct = self.nextContentType
newItem.setContentType(ct)
newItem.setValue(self.textList[i])
self.nestedItems.append(newItem)
m = re.search(r'(?<=Content-Type: ).+(?=;)',self.textList[i])
if m:
self.nextContentType = m.group(0)
i += 1
"""
def nestItem (self,item):
self.nestedItems.append(item)
def organizeData(self):
self.nestLevel = 1
self.currentSuper = self
m = re.search(r'(?<=boundary=).*',self.textList[0])
self.currentBoundary = m.group(0)
self.currentList = self.textList
self.currentList.remove(self.textList[0])
self.formerObjectDatabase = {}
pdb.set_trace()
while self.nestLevel > 0:
i = 0
while i < len(self.currentList):
boundary = self.currentBoundary
#If block is a "normal block", containing a current boundary identifier
p = re.search(r'--(?P<boundary>)(?!--)', text)
if p:
newItem = NestedItem(self.currentSuper)
newItem.setValue(self.currentList[i])
r = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
if r:
newItem.setContentType(r.group(0))
self.currentObject = newItem
self.currentSuper.nestItem(self.currentObject)
#If the block contains a new block boundary
m = re.search(r'(?<=boundary=).*',self.currentList[i])
if m:
#begin new layer of recursive commands
newFormerObject = self.FormerCurrentObject(self.currentList,self.currentSuper,self.currentBoundary)
self.formerObjectDatabase[self.nestLevel] = newFormerObject
self.currentSuper = self.currentObject
self.nestLevel += 1
self.currentBoundary = m.group(0)
boundary = self.currentBoundary
#self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i])
boundary = self.currentBoundary
#If block contains an "end of boundary" marker
q = re.search(r'(?P<boundary>)--', text)
if q:
self.nestLevel -= 1
currentObject = self.formerObjectDatabase[self.nestLevel]
self.currentList = currentObject.formerList
self.currentSuper = currentObject.formerSuper
self.currentBoundary = currentObject.formerBoundary
i += 1
class FormerCurrentObject:
def __init__(self,formerList,formerSuper,formerBoundary):
self.formerList = formerList
self.formerSuper = formerSuper
self.formerBoundary = formerBoundary
def printAll(self):
print "printing all: %d" % len(self.nestedItems)
i = 0
while i < len(self.nestedItems):
print "printing out item %d" % i
self.nestedItems[i].printOut()
i += 1
class NestedItem(object):
    """A node holding a content type, a value, and a list of child items."""

    def __init__(self,superObject,contentType=" ",value = " "):
        # superObject is stored as given; this class never reads it back.
        self.superObject = superObject
        self.contentType = contentType
        self.value = value
        self.nestedItems = []

    def nestItem(self,item):
        """Append *item* to this node's children."""
        self.nestedItems.append(item)

    def printOut(self,printBuffer = ""):
        """Print this node, then its children with one more space of prefix.

        Each node prints two lines, both prefixed with *printBuffer*:
        ``++<contentType>`` and the value.
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(printBuffer + '++%s' % self.contentType)
        print(printBuffer + self.value)
        childBuffer = printBuffer + " "
        # Iterate the children directly; the earlier index loop never advanced
        # its counter and so never terminated once a child existed.
        for child in self.nestedItems:
            child.printOut(childBuffer)

    def setContentType(self,contentType):
        """Replace the stored content type."""
        self.contentType = contentType

    def setValue(self,value):
        """Replace the stored value."""
        self.value = value
if type == "text only":
p = Parser(textList)
print p.returnParsed()
# ---PROBLEM CODE STARTS HERE---
def separate(text,boundary = None):
pdb.set_trace()
if boundary == None:
m = re.findall(r'(?<=boundary=).*',text)
i = 0
textList = [text]
while i < len(m): #have all levels of Boundary/headers named
boundary = m[i]
textList = recursiveSplit(textList,boundary)
i += 1
return textList
def recursiveSplit(chunk,boundary):
if type(chunk) is types.ListType: #<<--error occurs here
for obj in chunk:
recursiveSplit(obj,boundary)
if type(chunk) is types.StringType:
list = re.split(r'(?P<boundary>)(?!--)',chunk)
return list
return None
#---PROBLEM CODE ENDS(?) HERE---
if type == "MIME":
#separate the text file instead by its boundary identifier
p = MIMEParser(separate(text))
p.printAll()
You can use any MIME type email for this to run. Here's the one I've been using for convenience
MIME-Version: 1.0
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT)
Date: Fri, 3 May 2013 08:08:21 -0400
Delivered-To: MYEMAIL#gmail.com
Message-ID: <#mail.gmail.com>
Subject: MiB 5/3/13 7:43AM (EST)
From: ME<MYEMAIL#gmail.com>
To: SOMEONE <SOMEONE#aol.com>
Content-Type: multipart/mixed; boundary=BNDRY1
--BNDRY1
Content-Type: multipart/alternative; boundary=BNDRY2
--BNDRY2
Content-Type: text/plain; charset=ISO-8859-1
-changed signature methods to conform more to working clinic header
methods(please test/not testable in simulator)
-confirmed that signature image is showing up in simulator. Awaiting
further tests
-Modified findings spacing/buffer. See if you like it
--BNDRY2
Content-Type: text/html; charset=ISO-8859-1
<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div>
<div style>-Modified findings spacing/buffer. See if you like it</div></div>
--BNDRY2--
--BNDRY1
Content-Type: application/zip; name="Make it Brief.ipa.zip"
Content-Disposition: attachment; filename="Make it Brief.ipa.zip"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_hg9biuno0
<<FILE DATA>>
--BNDRY1--
The issue was in the regex. There may be a cooler way to do it, but I just created a search string literal based off of the variables.
def recursiveSplit(chunk,boundary):
    """Split *chunk* on the literal marker ``"--" + boundary``.

    * A string is split on the marker and the list of pieces is returned.
    * A list has each element replaced, in place, by the result of splitting
      that element recursively; the same list object is returned.
    * Any other type yields ``None``.

    The marker is matched literally, so boundaries that contain regex
    metacharacters (``+``, ``.``, ``(`` ...) split correctly.
    """
    if isinstance(chunk, str):
        return chunk.split("--%s" % boundary)
    if isinstance(chunk, list):
        chunk[:] = [recursiveSplit(part, boundary) for part in chunk]
        return chunk
    return None

I have 2 expressions which should be valid that are not recognized as such

import re
cards1 = "'F'*4 + 'H'*10"; cards2 = 'FFHH'
def find_number_of_cards(cards):
regexp = re.compile(r"(?P<FandH>[FH]+) | (('F')[*](?P<F>[0-9]+)\s*[+]\s*('H')[*](?P<H>[0-9]+))")
result = regexp.search(cards)
if result == None:
return ("The expression given is not valid.")
else:
FnH = result.group('FandH')
F = result.group('F')
H = result.group('H')
if FnH == None:
return F, H
else:
return "Blank."
print(find_number_of_cards(cards1))
print(find_number_of_cards(cards2))
Change this:
regexp = re.compile(r"(?P<FandH>[FH]+) | (('F')[*](?P<F>[0-9]+)\s*[+]\s*('H')[*](?P<H>[0-9]+))")
to this:
regexp = re.compile(r"(?P<FandH>[FH]+)|(('F')[*](?P<F>[0-9]+)\s*[+]\s*('H')[*](?P<H>[0-9]+))")
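The spaces around the `|` are part of the pattern, so each alternative requires a literal space in the input (after the `[FH]+` run, or before `'F'`), and neither test string contains one there.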

Categories

Resources