Finding if common sub-string exists with regex - python

I want to check whether another given string contains a contiguous sub-string of "chef" (characters in the same order) whose length is greater than 1.
So basically we want at least one of the strings ch, he, ef, che, hef or chef to exist in the given string.
Ex:
1> kefaa
Here we have ef which is part of ''chef'' so it is a valid string.
2> fhlasek
Here we have fh which characters exist in 'chef' but the sequence is incorrect so it is invalid.
I have this code that works. Adding the substrings manually is easy here because the string 'chef' has only a few possibilities, but I want code that will work for any given string.
import re
# Alternation of every contiguous slice of "chef" that is at least two
# characters long.
pattern = r"(ch|he|ef|che|hef|chef)"
s = input()
# re.search returns a match object (truthy) or None (falsy).
print('YES' if re.search(pattern, s) else 'NO')
P.S. I'm sorry if this question was already asked and solved; I was unable to find it.
Thank You.

Pure Python:
def test(txt, string):
le = len(txt)
fragments = [txt[i:j] for i in range(le) for j in range(i+1, le+1) if j-i>1]
# 'chef' --> ['ch', 'che', 'chef', 'he', 'hef', 'ef']
for fragment in fragments:
if fragment in string: return 'YES';
return 'NO'
print(test("chef", "ch")) # YES
print(test("chef", "che")) # YES
print(test("chef", "c")) # NO
print(test("chef", "fh")) # NO
print(test("chef", "kefaa")) # YES
If you need regexp here you go:
import re
def get_reg(txt, s):
    """Return True if ``s`` contains a slice of ``txt`` of length >= 2.

    Args:
        txt: The reference word whose slices are searched for.
        s: The string that is searched.

    Returns:
        bool: True when the alternation of all slices of ``txt`` with at
        least two characters matches somewhere in ``s``.
    """
    le = len(txt)
    # re.escape keeps characters such as '.' or '(' in txt literal instead of
    # letting them act as regex syntax.
    fragments = [re.escape(txt[i:j])
                 for i in range(le)
                 for j in range(i + 2, le + 1)]
    # With no fragments the joined pattern would be '' and match every input.
    if not fragments:
        return False
    return bool(re.search("|".join(fragments), s))
# 'chef' --> 'ch|che|chef|he|hef|ef'
print(get_reg("chef","ch")) # True
print(get_reg("chef","che")) # True
print(get_reg("chef","c")) # False
print(get_reg("chef","fh")) # False
print(get_reg("chef","kefaa")) # True
Recurse:
import re
def get_framgents(word):
    """Append every slice of ``word`` of length >= 2 to the global ``fragments``.

    The slices are appended grouped by start position, longest first, e.g.
    ``'chef'`` adds ``'chef', 'che', 'ch', 'hef', 'he', 'ef'``.

    The function name keeps its original spelling so existing calls still
    work. It iterates instead of recursing once per character, so long words
    do not run into the interpreter's recursion limit.

    Args:
        word: The word to cut into slices.

    Returns:
        None. The module-level list ``fragments`` is extended in place.
    """
    size = len(word)
    for start in range(size):
        # Walk the end index backwards so longer slices come first; stopping
        # above start + 1 excludes one-character slices.
        for end in range(size, start + 1, -1):
            fragments.append(word[start:end])
word = 'chef'
fragments = []
get_framgents(word) # --> ['chef','che','ch','hef','he','ef']
fragments = '|'.join(fragments) # --> 'chef|che|ch|hef|he|ef'
print(bool(re.search(fragments, "ch"))) # True
print(bool(re.search(fragments, "che"))) # True
print(bool(re.search(fragments, "c"))) # False
print(bool(re.search(fragments, "fh"))) # False
print(bool(re.search(fragments, "kaeef"))) # True

You can loop through the word and build a custom regex, then use that regex in your search:
# Import the module rather than `from re import compile`, which would shadow
# the builtin compile().
import re

word = "chef"
s = input()
# Every slice of `word` with two or more characters contains a two-character
# slice, so the adjacent pairs are enough. re.escape keeps any regex
# metacharacters in `word` literal.
pairs = [re.escape(word[i:i + 2]) for i in range(len(word) - 1)]
# An empty alternation would match every input, so only build a pattern when
# there is at least one pair.
pattern = re.compile("|".join(pairs)) if pairs else None
if pattern is not None and pattern.search(s):
    print("Yes")
else:
    print("No")

Related

python replace content of url between <br/> and <br/>

There's a string like this:
<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>
I want to delete the content:
https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0
Just keep:
<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>
My code:
mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
How to do it?
You can use re.sub from regex module:
import re
mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
print(re.sub(r'http[^<]+', '', mystring))
Output:
<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>
You can do this with regex replace:
Find: <br/>https?://[^<]*<br/>
Replace: <br/><br/>
mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
# remove 'https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0'
resultstring = '<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>'
marker = '<br/>'
# Position of the first marker, then of the next marker after it.
# str.find returns -1 when the marker is absent.
startPos = mystring.find(marker)
endPos = mystring.find(marker, startPos + len(marker)) if startPos != -1 else -1
if endPos == -1:
    # Fewer than two markers: there is nothing between markers to remove.
    completeResult = mystring
else:
    # Keep everything up to and including the first marker...
    firstSubString = mystring[:startPos + len(marker)]
    # ...and everything from the second marker onwards.
    lastSubString = mystring[endPos:]
    completeResult = firstSubString + lastSubString
print(completeResult, completeResult == resultstring)
print(completeResult, resultstring)
import re
mystring = '<p>Millions of people watch TV.</p><br/>https://sites.google.com/aaa-net.bb.cc/be-do-have/%E3%83%9B%E3%83%BC%E3%83%A0<br/><p>Good boy!</p><br/>'
# Match '<br/>https' plus the shortest run of characters that is followed by
# the next '<br/>'. The lookahead leaves that closing '<br/>' in place, and
# the replacement puts the opening '<br/>' back.
cleaned = re.sub(r"(?:<br/>https)([\s\S]*?)(?=<br/>)", '<br/>', mystring)
print(cleaned)
Output:
<p>Millions of people watch TV.</p><br/><br/><p>Good boy!</p><br/>

Python: map NLTK Stanford POS tags to WordNet POS tags

I'm reading a list of sentences and tagging each word with NLTK's Stanford POS tagger. I get outputs like so:
wordnet_sense = []
for o in output:
a = st.tag(o)
wordnet_sense.append(a)
outputs: [[(u'feel', u'VB'), (u'great', u'JJ')], [(u'good', u'JJ')]]
I want to map these words with their POS, so that they are recognised in WordNet.
I've attempted this:
# Question code (Python 2 print statement): for each tagged sentence, try to
# replace every (token, tag) pair with a WordNet synset.
sense = []
for i in wordnet_sense:
tmp = []
for tok, pos in i:
# First letter of the tag, lower-cased: 'VB' -> 'v', but 'JJ' -> 'j'.
lower_pos = pos[0].lower()
# 'j' is not in this list, so tokens tagged 'JJ' never reach wn.synsets()
# through this check, which is why 'great' and 'good' are not resolved.
if lower_pos in ['a', 'n', 'v', 'r', 's']:
res = wn.synsets(tok, lower_pos)
if len(res) > 0:
# Keep only the first synset returned.
a = res[0]
else:
# Fallback: a plain "[token, tag]" string instead of a synset.
a = "[{0}, {1}]".format(tok, pos)
tmp.append(a)
sense.append(tmp)
print sense
outputs: [[Synset('feel.v.01'), '[great, JJ]'], ['[good, JJ]']]
So feel is recognised as a verb, but great and good are not recognised as adjectives. I've also checked if great and good actually belong in Wordnet because I thought they weren't being mapped if they weren't there, but they are. Can anyone help?
Here's a cute function from pywsd:
from nltk.corpus import wordnet as wn
def penn2morphy(penntag, returnNone=False):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the first two characters of the tag are used, so e.g. 'NNS' and
    'NNP' are looked up as 'NN'.

    Args:
        penntag: The Penn Treebank tag, e.g. 'NN', 'VBD', 'JJ'.
        returnNone: When true, return None for tags without a mapping;
            otherwise return an empty string.

    Returns:
        wn.NOUN, wn.ADJ, wn.VERB or wn.ADV for 'NN*', 'JJ*', 'VB*' and 'RB*'
        tags; None or '' (see ``returnNone``) for anything else.
    """
    morphy_tag = {'NN': wn.NOUN, 'JJ': wn.ADJ,
                  'VB': wn.VERB, 'RB': wn.ADV}
    try:
        return morphy_tag[penntag[:2]]
    # KeyError: tag has no mapping. TypeError: penntag cannot be sliced or
    # hashed (e.g. None). Anything else is no longer swallowed.
    except (KeyError, TypeError):
        return None if returnNone else ''
def wordnet_pos_code(tag):
    """Return the WordNet POS constant for a Penn Treebank tag.

    Args:
        tag: The Penn Treebank tag, e.g. 'NN', 'VBZ', 'JJR'.

    Returns:
        wn.NOUN, wn.VERB, wn.ADJ or wn.ADV when the tag starts with 'NN',
        'VB', 'JJ' or 'RB' respectively; '' for any other tag.
    """
    if tag.startswith('NN'):
        return wn.NOUN
    elif tag.startswith('VB'):
        return wn.VERB
    elif tag.startswith('JJ'):
        return wn.ADJ
    elif tag.startswith('RB'):
        return wn.ADV
    else:
        return ''


# Call form with parentheses runs under both Python 2 and Python 3.
print(wordnet_pos_code('NN'))
As well as the answer provided, I've found this that also works.

Python detect if string contains specific length substring of chars from specific set

So given a set of chars and length
s = set('abc')
l = 5
How can I ensure that a string doesn't contain substrings like
abcab
aaaaa
Length needs to be around 60 so I can't just generate all substrings.
You can iterate through each character of the string and keep track of the previous number of characters that are elements of s.
def hasSubstring(s, l, text):
    """Return True if ``text`` has a run of ``l`` characters that are all in ``s``.

    The string to inspect is passed explicitly as ``text``; the earlier
    version looped over the builtin ``str`` type and could not run.

    Args:
        s: Collection of characters (e.g. ``set('abc')``).
        l: Length of the run to look for.
        text: The string to inspect.

    Returns:
        bool: True as soon as ``l`` consecutive characters of ``text`` all
        belong to ``s``; False if no such run exists. A non-positive ``l``
        returns True, since the empty substring is always present.
    """
    if l <= 0:
        return True
    run = 0  # length of the current streak of characters taken from s
    for c in text:
        if c in s:
            run += 1
            # A streak of exactly l characters is already a forbidden
            # substring, so compare with >= rather than >.
            if run >= l:
                return True
        else:
            run = 0
    return False
What about using product and list-comprehension.
from itertools import product
s = set('abc')
l = 5
omit = ['abcab','aaaaa']
def sorter(s, l, omit):
    """List every length-``l`` string over the characters of ``s`` not in ``omit``.

    Args:
        s: Iterable of single-character strings (e.g. ``set('abc')``). The
            output order follows the iteration order of ``s``.
        l: Length of the generated strings.
        omit: Iterable of strings to leave out of the result.

    Returns:
        list[str]: All ``len(s) ** l`` combinations in ``itertools.product``
        order, minus those listed in ``omit``.
    """
    alphabet = ''.join(s)
    # Set lookup instead of scanning the omit list once per candidate.
    excluded = frozenset(omit)
    return [word
            for word in map(''.join, product(alphabet, repeat=l))
            if word not in excluded]
print sorter(s, l, omit)
Output-
['aaaac', 'aaaab', 'aaaca', 'aaacc', 'aaacb', 'aaaba', 'aaabc', 'aaabb', 'aacaa', 'aacac', 'aacab', 'aacca', 'aaccc', 'aaccb', 'aacba', 'aacbc', 'aacbb', 'aabaa', 'aabac', 'aabab', 'aabca', 'aabcc', 'aabcb', 'aabba', 'aabbc', 'aabbb', 'acaaa', 'acaac', 'acaab', 'acaca', 'acacc', 'acacb', 'acaba', 'acabc', 'acabb', 'accaa', 'accac', 'accab', 'accca', 'acccc', 'acccb', 'accba', 'accbc', 'accbb', 'acbaa', 'acbac', 'acbab', 'acbca', 'acbcc', 'acbcb', 'acbba', 'acbbc', 'acbbb', 'abaaa', 'abaac', 'abaab', 'abaca', 'abacc', 'abacb', 'ababa', 'ababc', 'ababb', 'abcaa', 'abcac', 'abcca', 'abccc', 'abccb', 'abcba', 'abcbc', 'abcbb', 'abbaa', 'abbac', 'abbab', 'abbca', 'abbcc', 'abbcb', 'abbba', 'abbbc', 'abbbb', 'caaaa', 'caaac', 'caaab', 'caaca', 'caacc', 'caacb', 'caaba', 'caabc', 'caabb', 'cacaa', 'cacac', 'cacab', 'cacca', 'caccc', 'caccb', 'cacba', 'cacbc', 'cacbb', 'cabaa', 'cabac', 'cabab', 'cabca', 'cabcc', 'cabcb', 'cabba', 'cabbc', 'cabbb', 'ccaaa', 'ccaac', 'ccaab', 'ccaca', 'ccacc', 'ccacb', 'ccaba', 'ccabc', 'ccabb', 'cccaa', 'cccac', 'cccab', 'cccca', 'ccccc', 'ccccb', 'cccba', 'cccbc', 'cccbb', 'ccbaa', 'ccbac', 'ccbab', 'ccbca', 'ccbcc', 'ccbcb', 'ccbba', 'ccbbc', 'ccbbb', 'cbaaa', 'cbaac', 'cbaab', 'cbaca', 'cbacc', 'cbacb', 'cbaba', 'cbabc', 'cbabb', 'cbcaa', 'cbcac', 'cbcab', 'cbcca', 'cbccc', 'cbccb', 'cbcba', 'cbcbc', 'cbcbb', 'cbbaa', 'cbbac', 'cbbab', 'cbbca', 'cbbcc', 'cbbcb', 'cbbba', 'cbbbc', 'cbbbb', 'baaaa', 'baaac', 'baaab', 'baaca', 'baacc', 'baacb', 'baaba', 'baabc', 'baabb', 'bacaa', 'bacac', 'bacab', 'bacca', 'baccc', 'baccb', 'bacba', 'bacbc', 'bacbb', 'babaa', 'babac', 'babab', 'babca', 'babcc', 'babcb', 'babba', 'babbc', 'babbb', 'bcaaa', 'bcaac', 'bcaab', 'bcaca', 'bcacc', 'bcacb', 'bcaba', 'bcabc', 'bcabb', 'bccaa', 'bccac', 'bccab', 'bccca', 'bcccc', 'bcccb', 'bccba', 'bccbc', 'bccbb', 'bcbaa', 'bcbac', 'bcbab', 'bcbca', 'bcbcc', 'bcbcb', 'bcbba', 'bcbbc', 'bcbbb', 'bbaaa', 'bbaac', 'bbaab', 'bbaca', 'bbacc', 'bbacb', 'bbaba', 'bbabc', 
'bbabb', 'bbcaa', 'bbcac', 'bbcab', 'bbcca', 'bbccc', 'bbccb', 'bbcba', 'bbcbc', 'bbcbb', 'bbbaa', 'bbbac', 'bbbab', 'bbbca', 'bbbcc', 'bbbcb', 'bbbba', 'bbbbc', 'bbbbb']

How to replace text in curly brackets with another text based on comparisons using Python Regex

I am quite new to regular expressions. I have a string that looks like this:
str = "abc/def/([default], [testing])"
and a dictionary
dict = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
and using Python RE, I want str in this form, after comparisons of each element in dict to str:
str = "abc/def/(2.7, 2.1)"
Any help how to do it using Python RE?
P.S. its not the part of any assignment, instead it is the part of my project at work and I have spent many hours to figure out solution but in vain.
import re
st = "abc/def/([default], [testing], [something])"
dic = {'abc/def/[default]': '2.7',
       'abc/def/[testing]': '2.1',
       'bcd/xed/[something]': '3.1'}
# Raw strings so that \w and \[ reach the regex engine unchanged.
prefix_regex = r"^[\w*/]*"   # leading run of word characters, '*' and '/'
tag_regex = r"\[\w*\]"       # a bracketed tag such as [default]
prefix = re.findall(prefix_regex, st)[0]
tags = re.findall(tag_regex, st)
for key, value in dic.items():
    key_prefix = re.findall(prefix_regex, key)[0]
    key_tags = re.findall(tag_regex, key)
    # Only substitute when the key has a tag, shares the string's prefix and
    # that tag actually occurs in the string.
    if key_tags and key_prefix == prefix and key_tags[0] in tags:
        st = st.replace(key_tags[0], value)
# Call form with parentheses runs under both Python 2 and Python 3.
print(st)
OUTPUT:
abc/def/(2.7, 2.1, [something])
Here is a solution using re module.
Hypotheses :
there is a dictionary whose keys are composed of a prefix and a variable part, the variable part is enclosed in brackets ([])
the values are strings by which the variable parts are to be replaced in the string
the string is composed by a prefix, a (, a list of variable parts and a )
the variable parts in the string are enclosed in []
the variable parts in the string are separated by a comma followed by optional spaces
Python code :
import re
class splitter:
    """Replace bracketed tags in a ``prefix(tag, tag, ...)`` string via a dict.

    The dictionary keys are ``prefix + tag`` (e.g. ``'abc/def/[default]'``)
    and the values are the replacement strings.
    """

    # Everything before the first '(' is the prefix.
    pref = re.compile(r"[^(]+")
    # One bracketed tag such as [default].
    iden = re.compile(r"\[[^]]*\]")

    def __init__(self, d):
        """Store the lookup dictionary ``d``."""
        self.d = d

    def split(self, s):
        """Split ``s`` into its prefix and its list of bracketed tags.

        Returns:
            A ``(prefix, tags)`` tuple, or None when ``s`` has no prefix
            (it is empty or starts with '(').
        """
        m = self.pref.match(s)
        if m is None:
            return None
        # Search for tags only after the end of the prefix.
        return m.group(0), self.iden.findall(s, m.end())

    def convert(self, s):
        """Return ``s`` with every tag replaced by its dictionary value.

        Raises:
            ValueError: If ``s`` has no prefix before '('.
            KeyError: If ``prefix + tag`` is missing from the dictionary.
        """
        parts = self.split(s)
        if parts is None:
            raise ValueError("no prefix found before '(' in %r" % (s,))
        p, elts = parts
        return p + "(" + ", ".join(self.d[p + elt] for elt in elts) + ")"
Usage :
s = "abc/def/([default], [testing])"
d = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
sp = splitter(d)
print(sp.convert(s))
output :
abc/def/(2.7, 2.1)
Regex is probably not required here. Hope this helps
# NOTE: `str` and `dict` are the variables defined in the question; they
# shadow the builtins of the same name there.
lhs, rhs = str.split("/(")
lhs += "/"
# Look up every comma-separated tag, not just the first two.
values = [dict[lhs + tag] for tag in rhs.strip(")").split(", ")]
# Call form with parentheses runs under both Python 2 and Python 3.
print("{0}({1})".format(lhs, ",".join(values)))
output
abc/def/(2.7,2.1)

Two issue about python OpenOPC library

Issues description and environments
The OpenOPC library is friendly and easy to use, the api is simple too, but I have found two issues during the development of a tool to record real time OPC items data.
The development environment is: Window 8.1, Python 2.7.6, wxpython 2.8 unicode
The testing environment is: Window XP SP3, Python 2.7.6, wxpython 2.8 unicode, Rockwell's soft logix as OPC Server
The deploy environment is: Window XP SP3, connected with Rockwell's real PLC, installed RSLogix 5000 and RSLinx Classic Gateway
Questions
The opc.list function doesn't list all the items of a specified node, in both the testing and workstation environments. The question is: how can the item 't' be listed from the OPC server?
An int array 'dint100' and a dint 't' is added with RS logix 5000 at the scope of soft_1
With the default OPC client test tool from Rockwell it could list the new added 't'
With OpenOPC library, I couldn't find out how to list the item 't', but I could read it's value by opc.read('[soft_1]t') with it's tag.
If the 't' could be listed, it could be added into the IO tree of my tool.
The opc.servers function raises an OPCError in the deploy environment, but the client can connect to the 'RSLinx OPC Server' directly by using the server name. Does the opc.servers function depend on some special DLL or service?
Any suggestions will be appreciated! Thanks in advance!
Consider that the browsing problems ("opc.list") may not be on your side. RSLinx is notorious for its broken OPC browsing. Try some test/simulation server from a different vendor, to test this hypothesis.
I realize that I'm really late to this game. I found what was causing this issue. OpenOPC.py assumes that there cannot be both a "Leaf" and a "Branch" on the same level. Replace the function ilist with this:
def ilist(self, paths='*', recursive=False, flat=False, include_type=False):
"""Iterable version of list()

Generator that yields the matching browser entries one at a time (item IDs
obtained through GetItemID at the lowest level or for leaves). When
include_type is true, each yielded item is an (item, node_type) tuple.

paths is checked with type_check(); an invalid value raises TypeError and an
empty list is replaced by ['*']. If CreateBrowser() raises, the generator
ends without yielding anything. A pythoncom.com_error raised while browsing
is re-raised as OPCError.
"""
try:
self._update_tx_time()
pythoncom.CoInitialize()
try:
browser = self._opc.CreateBrowser()
# For OPC servers that don't support browsing
except:
return
paths, single, valid = type_check(paths)
if not valid:
raise TypeError("list(): 'paths' parameter must be a string or a list of strings")
if len(paths) == 0: paths = ['*']
# Items already yielded, used below to avoid yielding the same node twice.
nodes = {}
for path in paths:
# Flat mode: browse from the root and filter every entry by the path pattern.
# presumably ShowLeafs(True) requests a flattened leaf listing -- confirm
# against the OPC automation interface documentation.
if flat:
browser.MoveToRoot()
browser.Filter = ''
browser.ShowLeafs(True)
pattern = re.compile('^%s$' % wild2regex(path) , re.IGNORECASE)
matches = filter(pattern.search, browser)
# NOTE(review): node_type does not appear to be assigned before this point
# on the flat path (it is only bound by the loop further down) -- confirm.
if include_type: matches = [(x, node_type) for x in matches]
for node in matches: yield node
continue
# Paths still to be browsed, taken first-in first-out; recursive mode
# appends to this list below.
queue = []
queue.append(path)
while len(queue) > 0:
tag = queue.pop(0)
browser.MoveToRoot()
browser.Filter = ''
pattern = None
path_str = '/'
# Both '.' and '/' are treated as path separators; empty parts are dropped.
path_list = tag.replace('.','/').split('/')
path_list = [p for p in path_list if len(p) > 0]
found_filter = False
path_postfix = '/'
for i, p in enumerate(path_list):
# Parts that follow the first wildcard part are collected into path_postfix.
if found_filter:
path_postfix += p + '/'
elif p.find('*') >= 0:
pattern = re.compile('^%s$' % wild2regex(p) , re.IGNORECASE)
found_filter = True
elif len(p) != 0:
pattern = re.compile('^.*$')
browser.ShowBranches()
# Branch node, so move down
if len(browser) > 0:
try:
browser.MoveDown(p)
path_str += p + '/'
except:
# MoveDown failed: give up unless this is the last path part, in which
# case the part is used as the match pattern instead.
if i < len(path_list)-1: return
pattern = re.compile('^%s$' % wild2regex(p) , re.IGNORECASE)
# Leaf node, so append all remaining path parts together
# to form a single search expression
else:
###################################### JG Edit - Flip the next two rows comment/uncommented
p = '.'.join(path_list[i:])
# p = string.join(path_list[i:], '.')
pattern = re.compile('^%s$' % wild2regex(p) , re.IGNORECASE)
break
###################################### JG Edit - Comment this to return to original
browser.ShowBranches()
# Branches and leaves at the current level are both examined; when there
# are no branches, only leaves are processed.
node_types = ['Branch','Leaf']
if len(browser) == 0:
lowest_level = True
node_types.pop(0)
else:
lowest_level = False
for node_type in node_types:
if node_type=='Leaf':
browser.ShowLeafs(False)
matches = filter(pattern.search, browser)
# Recursive mode above the lowest level: queue each match for another
# pass, re-attaching the path parts that followed the wildcard.
if not lowest_level and recursive:
queue += [path_str + x + path_postfix for x in matches]
else:
###################################### JG Edit - Flip the next two rows comment/uncommented
if lowest_level or node_type=='Leaf': matches = [exceptional(browser.GetItemID,x)(x) for x in matches]
# if lowest_level: matches = [exceptional(browser.GetItemID,x)(x) for x in matches]
if include_type: matches = [(x, node_type) for x in matches]
for node in matches:
# Yield each node only the first time it is seen.
if not node in nodes: yield node
nodes[node] = True
###################################### Uncomment this to return to original
# browser.ShowBranches()
# if len(browser) == 0:
# browser.ShowLeafs(False)
# lowest_level = True
# node_type = 'Leaf'
# else:
# lowest_level = False
# node_type = 'Branch'
# matches = filter(pattern.search, browser)
# if not lowest_level and recursive:
# queue += [path_str + x + path_postfix for x in matches]
# else:
# if lowest_level: matches = [exceptional(browser.GetItemID,x)(x) for x in matches]
# if include_type: matches = [(x, node_type) for x in matches]
# for node in matches:
# if not node in nodes: yield node
# nodes[node] = True
# COM failures are reported to the caller as OPCError with a readable message.
except pythoncom.com_error as err:
error_msg = 'list: %s' % self._get_error_str(err)
raise OPCError(error_msg)

Categories

Resources