regular expression for a string format - python

I have a string as
(device
(vfb
(xxxxxxxx)
(xxxxxxxx)
(location 0.0.0.0:5900)
)
)
(device
(console
(xxxxxxxx)
(xxxxxxxx)
(location 80)
)
)
I need to read the location line from "vfb" portion of the string. I have tried to use regular expression like
import re
re.findall(r'device.*?\vfb.*?\(.*?(.*?).*(.*?\))
But it doesn't give me the required output.

It's better to use a parser for problems like this. Fortunately, a parser would be rather trivial in your case:
def parse(source):
def expr(tokens):
t = tokens.pop(0)
if t != '(':
return {'value': t}
key, val = tokens.pop(0), {}
while tokens[0] != ')':
val.update(expr(tokens))
tokens.pop(0)
return {key:val}
tokens = re.findall(r'\(|\)|[^\s()]+', source)
lst = []
while tokens:
lst.append(expr(tokens))
return lst
Given the above snippet, this creates a structure like:
[{'device': {'vfb': {'location': {'value': '0.0.0.0:5900'}, 'xxxxxxxx': {}}}},
{'device': {'console': {'location': {'value': '80'}, 'xxxxxxxx': {}}}}]
Now you can iterate it and fetch whatever you need:
for item in parse(source):
try:
location = item['device']['vfb']['location']['value']
except KeyError:
pass

With that intro from Martijn Pieters, here is a pyparsing approach:
inputdata = """(device
(vfb
(xxxxxxxx)
(xxxxxxxx)
(location 0.0.0.0:5900)
)
)
(device
(console
(xxxxxxxx)
(xxxxxxxx)
(location 80)
)
)"""
from pyparsing import OneOrMore, nestedExpr
# a nestedExpr defaults to reading space-separated words within nested parentheses
data = OneOrMore(nestedExpr()).parseString(inputdata)
print (data.asList())
# recursive search to walk parsed data to find desired entry
def findPath(seq, path):
for s in seq:
if s[0] == path[0]:
if len(path) == 1:
return s[1]
else:
ret = findPath(s[1:], path[1:])
if ret is not None:
return ret
return None
print findPath(data, "device/vfb/location".split('/'))
prints:
[['device', ['vfb', ['xxxxxxxx'], ['xxxxxxxx'], ['location', '0.0.0.0:5900']]],
['device', ['console', ['xxxxxxxx'], ['xxxxxxxx'], ['location', '80']]]]
0.0.0.0:5900

Maybe this gets you started:
In [84]: data = '(device(vfb(xxxxxxxx)(xxxxxxxx)(location 0.0.0.0:5900)))'
In [85]: m = re.search(r"""
.....: vfb
.....: .*
.....: \(
.....: location
.....: \s+
.....: (
.....: [^\)]+
.....: )
.....: \)""", data, flags=re.X)
In [86]: m.group(1)
Out[86]: '0.0.0.0:5900'

Related

Check which optional parameters are supplied to a function call

My goal is to run through all the *.py files in a directory and look at each call to a specific function test_func. This function has some optional parameters and I need to audit when the function is called with the optional parameters. My thought is to use the ast library (specifically ast.walk()).
I suppose this is a static analysis problem.
# function definition
def test_func(
name: str,
*,
user: Optional['User'] = None,
request: Optional[WebRequest] = None,
**kwargs
) -> bool:
pass
# somewhere in another file ...
test_func('name0')
test_func('name1', request=request)
test_func('name1')
test_func('name2', user=user)
# figure out something like below:
# name0 is never given any optional parameters
# name1 is sometimes given request
# name2 is always given user
Here is a POC :
import typing
from typing import Optional
class User: pass
class WebRequest: pass
# function definition
def test_func(
name: str,
*,
user: Optional['User'] = None,
request: Optional[WebRequest] = None,
**kwargs
) -> bool:
pass
# somewhere in another file ...
test_func('name0')
test_func('name1', request=WebRequest())
test_func('name1')
test_func('name2', user=User())
# figure out something like below:
# name0 is never given any optional parameters
# name1 is sometimes given request
# name2 is always given user
with open(__file__, "rt") as py_file:
py_code = py_file.read()
import collections
each_call_kwargs_names_by_arg0_value: typing.Dict[str, typing.List[typing.Tuple[str, ...]]] = collections.defaultdict(list)
import ast
tree = ast.parse(py_code)
for node in ast.walk(tree):
if isinstance(node, ast.Call):
if hasattr(node.func, "id"):
name = node.func.id
elif hasattr(node.func, "attr"):
name = node.func.attr
elif hasattr(node.func, "value"):
name = node.func.value.id
else:
raise NotImplementedError
print(name)
if name == "test_func":
arg0_value = typing.cast(ast.Str, node.args[0]).s
each_call_kwargs_names_by_arg0_value[arg0_value].append(
tuple(keyword.arg for keyword in node.keywords)
)
for arg0_value, each_call_kwargs_names in each_call_kwargs_names_by_arg0_value.items():
frequency = "NEVER" if all(len(call_args) == 0 for call_args in each_call_kwargs_names) else \
"ALWAYS" if all(len(call_args) != 0 for call_args in each_call_kwargs_names) else \
"SOMETIMES"
print(f"{arg0_value!r} {frequency}: {each_call_kwargs_names}")
# Output :
# 'name0' NEVER: [()]
# 'name1' SOMETIMES: [('request',), ()]
# 'name2' ALWAYS: [('user',)]
You can use a recursive generator function to traverse an ast of your Python code:
import ast
def get_calls(d, f = ['test_func']):
if isinstance(d, ast.Call) and d.func.id in f:
yield None if not d.args else d.args[0].value, [i.arg for i in d.keywords]
for i in getattr(d, '_fields', []):
vals = (m if isinstance((m:=getattr(d, i)), list) else [m])
yield from [j for k in vals for j in get_calls(k, f = f)]
Putting it all together:
import os, collections
d = collections.defaultdict(list)
for f in os.listdir(os.getcwd()):
if f.endswith('.py'):
with open(f) as f:
for a, b in get_calls(ast.parse(f.read())):
d[a].append(b)
r = {a:{'verdict':'never' if not any(b) else 'always' if all(b) else 'sometimes', 'params':[i[0] for i in b if i]}
for a, b in d.items()}
Output:
{'name0': {'verdict': 'never', 'params': []},
'name1': {'verdict': 'sometimes', 'params': ['request']},
'name2': {'verdict': 'always', 'params': ['user']}}

Preferential parsing of sentence elements using infixnotation in Pyparsing

I have some sentences that I need to convert to regex code and I was trying to use Pyparsing for it. The sentences are basically search rules, telling us what to search for.
Examples of sentences -
LINE_CONTAINS this is a phrase
-this is an example search rule telling that the line you are searching on should have the phrase this is a phrase
LINE_STARTSWITH However we - this is an example search rule telling that the line you are searching on should start with the phrase However we
The rules can be combined too, like- LINE_CONTAINS phrase one BEFORE {phrase2 AND phrase3} AND LINE_STARTSWITH However we
Now, I am trying to parse these sentences and then convert them to regex code. All lines start with either of the 2 symbols mentioned above (call them line_directives). I want to be able to consider these line_directives, and parse them appropriately and do the same for the phrase that follow them, albeit differently parsed. Using help from Paul McGuire(here)and my own inputs, I have the following code-
from pyparsing import *
import re
UPTO, AND, OR, WORDS = map(Literal, "upto AND OR words".split())
keyword = UPTO | WORDS | AND | OR
LBRACE,RBRACE = map(Suppress, "{}")
integer = pyparsing_common.integer()
LINE_CONTAINS, LINE_STARTSWITH, LINE_ENDSWITH = map(Literal,
"""LINE_CONTAINS LINE_STARTSWITH LINE_ENDSWITH""".split()) # put option for LINE_ENDSWITH. Users may use, I don't presently
BEFORE, AFTER, JOIN = map(Literal, "BEFORE AFTER JOIN".split())
word = ~keyword + Word(alphas)
phrase = Group(OneOrMore(word))
upto_expr = Group(LBRACE + UPTO + integer("numberofwords") + WORDS + RBRACE)
class Node(object):
def __init__(self, tokens):
self.tokens = tokens
def generate(self):
pass
class LiteralNode(Node):
def generate(self):
print (self.tokens[0], 20)
for el in self.tokens[0]:
print (el,type(el), 19)
print (type(self.tokens[0]), 18)
return "(%s)" %(' '.join(self.tokens[0])) # here, merged the elements, so that re.escape does not have to do an escape for the entire list
def __repr__(self):
return repr(self.tokens[0])
class AndNode(Node):
def generate(self):
tokens = self.tokens[0]
return '.*'.join(t.generate() for t in tokens[::2]) # change this to the correct form of AND in regex
def __repr__(self):
return ' AND '.join(repr(t) for t in self.tokens[0].asList()[::2])
class OrNode(Node):
def generate(self):
tokens = self.tokens[0]
return '|'.join(t.generate() for t in tokens[::2])
def __repr__(self):
return ' OR '.join(repr(t) for t in self.tokens[0].asList()[::2])
class UpToNode(Node):
def generate(self):
tokens = self.tokens[0]
ret = tokens[0].generate()
print (123123)
word_re = r"\s+\S+"
space_re = r"\s+"
for op, operand in zip(tokens[1::2], tokens[2::2]):
# op contains the parsed "upto" expression
ret += "((%s){0,%d}%s)" % (word_re, op.numberofwords, space_re) + operand.generate()
print ret
return ret
def __repr__(self):
tokens = self.tokens[0]
ret = repr(tokens[0])
for op, operand in zip(tokens[1::2], tokens[2::2]):
# op contains the parsed "upto" expression
ret += " {0-%d WORDS} " % (op.numberofwords) + repr(operand)
return ret
phrase_expr = infixNotation(phrase,
[
((BEFORE | AFTER | JOIN), 2, opAssoc.LEFT,), # (opExpr, numTerms, rightLeftAssoc, parseAction)
(AND, 2, opAssoc.LEFT,),
(OR, 2, opAssoc.LEFT),
],
lpar=Suppress('{'), rpar=Suppress('}')
) # structure of a single phrase with its operators
line_term = Group((LINE_CONTAINS | LINE_STARTSWITH | LINE_ENDSWITH)("line_directive") +
Group(phrase_expr)("phrase")) # basically giving structure to a single sub-rule having line-term and phrase
line_contents_expr = infixNotation(line_term,
[(AND, 2, opAssoc.LEFT,),
(OR, 2, opAssoc.LEFT),
]
) # grammar for the entire rule/sentence
phrase_expr = infixNotation(line_contents_expr.setParseAction(LiteralNode),
[
(upto_expr, 2, opAssoc.LEFT, UpToNode),
(AND, 2, opAssoc.LEFT, AndNode),
(OR, 2, opAssoc.LEFT, OrNode),
])
tests1 = """LINE_CONTAINS overexpressing gene AND other things""".splitlines()
for t in tests1:
t = t.strip()
if not t:
continue
# print(t, 12)
try:
parsed = phrase_expr.parseString(t)
except ParseException as pe:
print(' '*pe.loc + '^')
print(pe)
continue
print (parsed[0], 14)
print (type(parsed[0]))
print(parsed[0].generate(), 15)
This simple code, on running gives the following error-
((['LINE_CONTAINS', ([(['overexpressing', 'gene'], {})], {})],
{'phrase': [(([(['overexpressing', 'gene'], {})], {}), 1)],
'line_directive': [('LINE_CONTAINS', 0)]}), 14)
((['LINE_CONTAINS', ([(['overexpressing', 'gene'], {})], {})],
{'phrase': [(([(['overexpressing', 'gene'], {})], {}), 1)],
'line_directive': [('LINE_CONTAINS', 0)]}), 20)
('LINE_CONTAINS', <, 19)
(([(['overexpressing', 'gene'], {})], {}), , 19)
(, 18)
TypeError: sequence item 1: expected string, ParseResults found (line
29)
(The error code is not completely correct, as angular brackets are not well supported in blockquote here)
So my question is, even though I have written the grammar (using infixnotation) such that it treats LINE_CONTAINS as a line_directive and parse the remaining the line accordingly, why is it not able to parse properly? What is a good way to parse such lines?

Extract data from within parenthesis in python

I know there are many questions with the same title. My situation is a little different. I have a string like:
"Cat(Money(8)Points(80)Friends(Online(0)Offline(8)Total(8)))Mouse(Money(10)Points(10000)Friends(Online(10)Offline(80)Total(90)))"
(Notice that there are parenthesis nested inside another)
and I need to parse it into nested dictionaries like for example:
d["Cat"]["Money"] == 8
d["Cat"]["Points"] = 80
d["Mouse"]["Friends"]["Online"] == 10
and so on. I would like to do this without libraries and regex. If you choose to use these, please explain the code in great detail.
Thanks in advance!
Edit:
Although this code will not make any sense, this is what I have so far:
o_str = "Jake(Money(8)Points(80)Friends(Online(0)Offline(8)Total(8)))Mouse(Money(10)Points(10000)Friends(Online(10)Offline(80)Total(90)))"
spl = o_str.split("(")
def reverseIndex(str1, str2):
try:
return len(str1) - str1.rindex(str2)
except Exception:
return len(str1)
def app(arr,end):
new_arr = []
for i in range(0,len(arr)):
if i < len(arr)-1:
new_arr.append(arr[i]+end)
else:
new_arr.append(arr[i])
return new_arr
spl = app(spl,"(")
ends = []
end_words = []
op = 0
cl = 0
for i in range(0,len(spl)):
print i
cl += spl[i].count(")")
op += 1
if cl == op-1:
ends.append(i)
end_words.append(spl[i])
#break
print op
print cl
print
print end_words
The end words are the sections at the beginning of each statement. I plan on using recursive to do the rest.
Now that was interesting. You really nerd-sniped me on this one...
def parse(tokens):
""" take iterator of tokens, parse to dictionary or atom """
dictionary = {}
# iterate tokens...
for token in tokens:
if token == ")" or next(tokens) == ")":
# token is ')' -> end of dict; next is ')' -> 'leaf'
break
# add sub-parse to dictionary
dictionary[token] = parse(tokens)
# return dict, if non-empty, else token
return dictionary or int(token)
Setup and demo:
>>> s = "Cat(Money(8)Points(80)Friends(Online(0)Offline(8)Total(8)))Mouse(Money(10)Points(10000)Friends(Online(10)Offline(80)Total(90)))"
>>> tokens = iter(s.replace("(", " ( ").replace(")", " ) ").split())
>>> pprint(parse(tokens))
{'Cat': {'Friends': {'Offline': 8, 'Online': 0, 'Total': 8},
'Money': 8,
'Points': 80},
'Mouse': {'Friends': {'Offline': 80, 'Online': 10, 'Total': 90},
'Money': 10,
'Points': 10000}}
Alternatively, you could also use a series of string replacements to turn that string into an actual Python dictionary string and then evaluate that, e.g. like this:
as_dict = eval("{'" + s.replace(")", "'}, ")
.replace("(", "': {'")
.replace(", ", ", '")
.replace(", ''", "")[:-3] + "}")
This will wrap the 'leafs' in singleton sets of strings, e.g. {'8'} instead of 8, but this should be easy to fix in a post-processing step.

How to replace text in curly brackets with another text based on comparisons using Python Regex

I am quiet new to regular expressions. I have a string that looks like this:
str = "abc/def/([default], [testing])"
and a dictionary
dict = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
and using Python RE, I want str in this form, after comparisons of each element in dict to str:
str = "abc/def/(2.7, 2.1)"
Any help how to do it using Python RE?
P.S. its not the part of any assignment, instead it is the part of my project at work and I have spent many hours to figure out solution but in vain.
import re
st = "abc/def/([default], [testing], [something])"
dic = {'abc/def/[default]' : '2.7',
'abc/def/[testing]' : '2.1',
'bcd/xed/[something]' : '3.1'}
prefix_regex = "^[\w*/]*"
tag_regex = "\[\w*\]"
prefix = re.findall(prefix_regex, st)[0]
tags = re.findall(tag_regex, st)
for key in dic:
key_prefix = re.findall(prefix_regex, key)[0]
key_tag = re.findall(tag_regex, key)[0]
if prefix == key_prefix:
for tag in tags:
if tag == key_tag:
st = st.replace(tag, dic[key])
print st
OUTPUT:
abc/def/(2.7, 2.1, [something])
Here is a solution using re module.
Hypotheses :
there is a dictionary whose keys are composed of a prefix and a variable part, the variable part is enclosed in brackets ([])
the values are strings by which the variable parts are to be replaced in the string
the string is composed by a prefix, a (, a list of variable parts and a )
the variable parts in the string are enclosed in []
the variable parts in the string are separated by a comma followed by optional spaces
Python code :
import re
class splitter:
pref = re.compile("[^(]+")
iden = re.compile("\[[^]]*\]")
def __init__(self, d):
self.d = d
def split(self, s):
m = self.pref.match(s)
if m is not None:
p = m.group(0)
elts = self.iden.findall(s, m.span()[1])
return p, elts
return None
def convert(self, s):
p, elts = self.split(s)
return p + "(" + ", ".join((self.d[p + elt] for elt in elts)) + ")"
Usage :
s = "abc/def/([default], [testing])"
d = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
sp = splitter(d)
print(sp.convert(s))
output :
abc/def/(2.7, 2.1)
Regex is probably not required here. Hope this helps
lhs,rhs = str.split("/(")
rhs1,rhs2 = rhs.strip(")").split(", ")
lhs+="/"
print "{0}({1},{2})".format(lhs,dict[lhs+rhs1],dict[lhs+rhs2])
output
abc/def/(2.7,2.1)

How to parse parenthetical trees in python?

I need help with this developing this algorithm that I'm working on. I have a an input of a tree in the following format:
( Root ( AB ( ABC ) ( CBA ) ) ( CD ( CDE ) ( FGH ) ) )
This looks this the following tree.
Root
|
____________
AB CD
| |
__________ ___________
ABC CBA CDE FGH
What the algorithm is suppose to is read the parenthetical format in and give the following output:
Root -> AB CD
AB -> ABC CBA
CD -> CDE FGH
It list the root and its children and all other parents that have children.
I am not able to understand how to start up on this, Can someone help me gimme hint or give some references or links?
Solution: the Tree class from module nltk
(aka Natural Language Toolkit)
Making the actual parsing
This is your input:
input = '( Root ( AB ( ABC ) ( CBA ) ) ( CD ( CDE ) ( FGH ) ) )'
And you parse it very simply by doing:
from nltk import Tree
t = Tree.fromstring(input)
Playing with the parsed tree
>>> t.label()
'Root'
>>> len(t)
2
>>> t[0]
Tree('AB', [Tree('ABC', []), Tree('CBA', [])])
>>> t[1]
Tree('CD', [Tree('CDE', []), Tree('FGH', [])])
>>> t[0][0]
Tree('ABC', [])
>>> t[0][1]
Tree('CBA', [])
>>> t[1][0]
Tree('CDE', [])
>>> t[1][1]
Tree('FGH', [])
As you seen, you can treat each node as a list of subtrees.
To pretty-print the tree
>>> t.pretty_print()
Root
_______|________
AB CD
___|___ ___|___
ABC CBA CDE FGH
| | | |
... ... ... ...
To obtain the output you want
from sys import stdout
def showtree(t):
if (len(t) == 0):
return
stdout.write(t.label() + ' ->')
for c in t:
stdout.write(' ' + c.label())
stdout.write('\n')
for c in t:
showtree(c)
Usage:
>>> showtree(t)
Root -> AB CD
AB -> ABC CBA
CD -> CDE FGH
To install the module
pip install nltk
(Use sudo if required)
A recursive descent parser is a simple form of parser that can parse many grammars. While the entire theory of parsing is too large for a stack-overflow answer, the most common approach to parsing involves two steps: first, tokenisation, which extracts subwords of your string (here, probably words like 'Root', and 'ABC', or brackets like '(' and ')'), and then parsing using recursive functions.
This code parses input (like your example), producing a so-called parse tree, and also has a function 'show_children' which takes the parse tree, and produces the children view of the expression as your question asked.
import re
class ParseError(Exception):
pass
# Tokenize a string.
# Tokens yielded are of the form (type, string)
# Possible values for 'type' are '(', ')' and 'WORD'
def tokenize(s):
toks = re.compile(' +|[A-Za-z]+|[()]')
for match in toks.finditer(s):
s = match.group(0)
if s[0] == ' ':
continue
if s[0] in '()':
yield (s, s)
else:
yield ('WORD', s)
# Parse once we're inside an opening bracket.
def parse_inner(toks):
ty, name = next(toks)
if ty != 'WORD': raise ParseError
children = []
while True:
ty, s = next(toks)
if ty == '(':
children.append(parse_inner(toks))
elif ty == ')':
return (name, children)
# Parse this grammar:
# ROOT ::= '(' INNER
# INNER ::= WORD ROOT* ')'
# WORD ::= [A-Za-z]+
def parse_root(toks):
ty, _ = next(toks)
if ty != '(': raise ParseError
return parse_inner(toks)
def show_children(tree):
name, children = tree
if not children: return
print '%s -> %s' % (name, ' '.join(child[0] for child in children))
for child in children:
show_children(child)
example = '( Root ( AB ( ABC ) ( CBA ) ) ( CD ( CDE ) ( FGH ) ) )'
show_children(parse_root(tokenize(example)))
Try this :
def toTree(expression):
tree = dict()
msg =""
stack = list()
for char in expression:
if(char == '('):
stack.append(msg)
msg = ""
elif char == ')':
parent = stack.pop()
if parent not in tree:
tree[parent] = list()
tree[parent].append(msg)
msg = parent
else:
msg += char
return tree
expression = "(Root(AB(ABC)(CBA))(CD(CDE)(FGH)))"
print toTree(expression)
It returns a dictionary, where the root can be accessed with the key ''. You can then do a simple BFS to print the output.
OUTPUT :
{
'' : ['Root'],
'AB' : ['ABC', 'CBA'],
'Root': ['AB', 'CD'],
'CD' : ['CDE', 'FGH']
}
You will have to eliminate all the whitespaces in the Expression before you start, or ignore the inrrelevant charaters in the expression by adding the following as the very first line in the for-loop :
if char == <IRRELEVANT CHARACTER>:
continue
The above code will run in O(n) time, where n is the length of the expression.
EDIT
Here is the printing function :
def printTree(tree, node):
if node not in tree:
return
print '%s -> %s' % (node, ' '.join(child for child in tree[node]))
for child in tree[node]:
printTree(tree, child)
The desired Output can be achieved by the following :
expression = "(Root(AB(ABC)(CBA))(CD(CDE)(FGH)))"
tree = toTree(expression)
printTree(tree, tree[''][0])
Output
Root -> AB CD
AB -> ABC CBA
CD -> CDE FGH
EDIT
Assuming the node names are not unique, we just have to give new names to the nodes. This can be done using :
def parseExpression(expression):
nodeMap = dict()
counter = 1
node = ""
retExp =""
for char in expression:
if char == '(' or char == ')' :
if (len(node) > 0):
nodeMap[str(counter)] = node;
retExp += str(counter)
counter +=1
retExp += char
node =""
elif char == ' ': continue
else :
node += char
return retExp,nodeMap
The print Function will now change to :
def printTree(tree, node, nodeMap):
if node not in tree:
return
print '%s -> %s' % (nodeMap[node], ' '.join(nodeMap[child] for child in tree[node]))
for child in tree[node]:
printTree(tree, child, nodeMap)
The output can be obtained by using :
expression = " ( Root( SQ ( VBZ ) ( NP ( DT ) ( NN ) ) ( VP ( VB ) ( NP ( NN ) ) ) ))"
expression, nodeMap = parseExpression(expression)
tree = toTree(expression)
printTree(tree, tree[''][0], nodeMap)
Output :
Root -> SQ
SQ -> VBZ NP VP
NP -> DT NN
VP -> VB NP
NP -> NN
I think the most popular solution for parsing in Python is PyParsing. PyParsing comes with a grammar for parsing S-expressions and you should be able to just use it. Discussed in this StackOverflow answer:
Parsing S-Expressions in Python

Categories

Resources