pyparsing parse nested expression with multiple opener and closer [duplicate] - python

I'd like to use pyparsing to parse an expression of the form: expr = '(gimme [some {nested [lists]}])', and get back a python list of the form: [[['gimme', ['some', ['nested', ['lists']]]]]]. Right now my grammar looks like this:
# Each nestedExpr handles exactly one opener/closer pair.  No `content`
# argument is given, so (per the pyparsing docs) the content defaults to
# whitespace-delimited text that excludes only that pair's own delimiters.
nestedParens = nestedExpr('(', ')')
nestedBrackets = nestedExpr('[', ']')
nestedCurlies = nestedExpr('{', '}')
# Alternation: try each bracket kind in turn.
# NOTE(review): the three expressions do not reference each other, so inside a
# (...) match the '[' and '{' characters are ordinary text -- consistent with
# the output quoted below.
enclosed = nestedParens | nestedBrackets | nestedCurlies
Presently, enclosed.searchString(expr) returns a list of the form: [[['gimme', ['some', '{nested', '[lists]}']]]]. This is not what I want because it's not recognizing the square or curly brackets, but I don't know why.

Here's a pyparsing solution that uses a self-modifying grammar to dynamically match the correct closing brace character.
from pyparsing import (Forward, Literal, NoMatch, Word, alphas, nestedExpr,
                       oneOf, printables)

data = '(gimme [some {nested, nested [lists]}])'

# Any of the three opening brackets may start a nested group.
opening = oneOf("( { [")
# Characters allowed in plain content: every printable except the brackets.
nonBracePrintables = ''.join(c for c in printables if c not in '(){}[]')
# Opening bracket -> the closing bracket that has to match it.
closingFor = dict(zip("({[", ")}]"))

# `closing` is a placeholder whose definition is swapped while parsing, so
# that it always matches the closer of the innermost open bracket.
closing = Forward()
# initialize closing with an expression (nothing is open yet, so never match)
closing << NoMatch()
# Definitions of `closing` for the enclosing, still-open groups.
closingStack = []


def pushClosing(t):
    """Parse action for `opening`: expect the closer paired with t[0].

    The previous definition of `closing` is saved on `closingStack` so it
    can be restored once this group has been closed.
    """
    closingStack.append(closing.expr)
    closing << Literal(closingFor[t[0]])


def popClosing():
    """Parse action for `closing`: restore the enclosing group's closer."""
    closing << closingStack.pop()


opening.setParseAction(pushClosing)
closing.setParseAction(popClosing)

# Word(alphas) is tried first so that punctuation such as ',' ends up as a
# separate token instead of being glued to the preceding word.
matchedNesting = nestedExpr(opening, closing,
                            Word(alphas) | Word(nonBracePrintables))

# print() with a single argument behaves the same on Python 2 and Python 3.
print(matchedNesting.parseString(data).asList())
prints:
[['gimme', ['some', ['nested', ',', 'nested', ['lists']]]]]
Updated: I posted the above solution because I had actually written it over a year ago as an experiment. I just took a closer look at your original post, and it made me think of the recursive type definition created by the operatorPrecedence method, and so I redid this solution, using your original approach - much simpler to follow! (might have a left-recursion issue with the right input data though, not thoroughly tested):
from pyparsing import Forward, Word, alphas, nestedExpr

# Forward declaration: the three nested expressions below need to refer to
# `enclosed` as their content before `enclosed` itself can be defined.
enclosed = Forward()
nestedParens = nestedExpr('(', ')', content=enclosed)
nestedBrackets = nestedExpr('[', ']', content=enclosed)
nestedCurlies = nestedExpr('{', '}', content=enclosed)
# Content is a word, a comma, or any of the three bracketed groups, which
# makes every bracket kind nestable inside every other.
enclosed << (Word(alphas) | ',' | nestedParens | nestedBrackets | nestedCurlies)

data = '(gimme [some {nested, nested [lists]}])'
# print() with a single argument behaves the same on Python 2 and Python 3.
print(enclosed.parseString(data).asList())
Gives:
[['gimme', ['some', ['nested', ',', 'nested', ['lists']]]]]
EDITED:
Here is a diagram of the updated parser, using the railroad diagramming support coming in pyparsing 3.0.

This should do the trick for you. I tested it on your example:
import re
import ast
def parse(s):
    """Parse one bracketed expression into nested Python lists.

    '(', '[' and '{' each open a nested list and must be closed by the
    matching ')', ']' or '}'.  Words are maximal runs of characters that are
    neither brackets nor whitespace.

    Args:
        s: text holding exactly one top-level bracketed expression, e.g.
            '(gimme [some {nested [lists]}])'.

    Returns:
        The list for the outermost bracket pair, e.g.
        ['gimme', ['some', ['nested', ['lists']]]].

    Raises:
        ValueError: if brackets are unbalanced or mismatched, or if the input
            is not exactly one bracketed expression.
    """
    closer_for = {'(': ')', '[': ']', '{': '}'}
    closers = set(closer_for.values())

    root = []           # collects whatever appears at the top level
    stack = [root]      # stack[-1] is the list currently being filled
    expected = []       # closers still owed, innermost last
    word = []           # characters of the word in progress

    def flush():
        # Finish the word in progress (if any) and store it in the open list.
        if word:
            stack[-1].append(''.join(word))
            del word[:]

    for char in s:
        if char in closer_for:
            flush()
            child = []
            stack[-1].append(child)
            stack.append(child)
            expected.append(closer_for[char])
        elif char in closers:
            flush()
            if not expected or expected.pop() != char:
                raise ValueError("unexpected %r in %r" % (char, s))
            stack.pop()
        elif char.isspace():
            flush()
        else:
            word.append(char)
    flush()

    if expected:
        raise ValueError("missing %r in %r" % (expected[-1], s))
    if len(root) != 1 or not isinstance(root[0], list):
        raise ValueError("expected exactly one bracketed expression: %r" % (s,))
    return root[0]
Comment if you need more

Related

Python : How to translate?

The program should work like this: when the user inputs "8#15#23###23#1#19###9#20",
the output should be "HOW WAS IT".
However, it fails to decode the space marker (###).
enter code here
# Encoding table: each letter maps to its 1-based alphabet position as a string.
ABSTRACT ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
# A space is encoded as '###'; the empty string is paired with a lone '#'.
" ":"###","":"#" }
# Decoding table: ABSTRACT with keys and values swapped (code -> character).
ABSTRACT_SHIFTED = {value:key for key,value in ABSTRACT.items()}
def from_abstract(s):
# Decode `s` by looking up each token in ABSTRACT_SHIFTED and concatenating.
result = ''
# NOTE(review): this splits on '*', but the sample input quoted above is
# delimited by '#', so the whole string arrives as a single token.
for word in s.split('*'):
# NOTE(review): .get() returns None for a token that is not in the table, and
# str + None raises TypeError.
result = result +ABSTRACT_SHIFTED.get(word)
return result
This would do the trick:
#!/usr/bin/env python
# Decode a '#'-separated list of 1-based alphabet positions ("8" -> "H").
# Words are separated by "###" in the input.
InputString = "8#15#23###23#1#19###9#20"
# Shrink the word separator to "##" so that splitting on "#" leaves exactly
# one empty field between two words.
InputString = InputString.replace("###", "##")
InputString = InputString.split("#")
DecodedMessage = ""
for NumericRepresentation in InputString:
    if NumericRepresentation == "":
        # An empty field marks a word boundary.
        DecodedMessage += " "
    else:
        # 'A' is chr(65), so alphabet position n maps to chr(n + 64).
        DecodedMessage += chr(int(NumericRepresentation) + 64)
print(DecodedMessage)
Prints:
HOW WAS IT
you can also use a regex
import re
# Encoding table: letter -> 1-based alphabet position; ' ' -> '###'; the empty
# string is paired with the lone '#' that separates letters.
replacer = {"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
            " ":"###","":"#" }
# Decoding table (code -> character).  Not called `reversed`, which would
# shadow the builtin of that name.
decoder = {value: key for key, value in replacer.items()}
target = '8#15#23###23#1#19###9#20'
# Regex alternation takes the FIRST branch that matches, not the longest one,
# so list longer codes first: '###' before '#', and '15' before '1'.  Sorting
# by length makes this independent of dict ordering; re.escape keeps every
# code a literal.
pattern = '|'.join(re.escape(code)
                   for code in sorted(decoder, key=len, reverse=True))


def repl(match):
    """Return the character encoded by the matched code."""
    return decoder[match.group(0)]


decoded = re.sub(pattern, string=target, repl=repl)
print(decoded)
And prints:
HOW WAS IT
With a couple minimal changes to your code it works.
1) split on '#', not '*'
2) retrieve ' ' by default if a match isn't found
3) use '##' instead of '###'
def from_abstract(s):
    """Decode a '#'-separated abstract string back into text.

    Each token is looked up in the module-level ABSTRACT_SHIFTED table; a
    token that is not in the table decodes to a single space.

    Args:
        s: encoded text, e.g. "8#15#23###23#1#19###9#20".

    Returns:
        The decoded string.
    """
    # Shrinking '###' to '##' makes split('#') yield exactly one empty token
    # per encoded space; that empty token is not a key of ABSTRACT_SHIFTED,
    # so it falls back to the ' ' default.
    tokens = s.replace('###', '##').split('#')
    return ''.join(ABSTRACT_SHIFTED.get(word, " ") for word in tokens)
Swap the key-value pairs of ABSTRACT and use simple split + join on input
ip = "8#15#23###23#1#19###9#20"
# Swap keys and values so the table maps code -> character.
ABSTRACT = dict((v, k) for k, v in ABSTRACT.items())
# Words are separated by '###' and letters by '#': split on the longer marker
# first, decode each word on its own, then rejoin the words with one space.
decoded = ' '.join(
    ''.join(ABSTRACT.get(i, ' ') for i in word.split('#'))
    for word in ip.split('###')
)
print(decoded)
#'HOW WAS IT'
The biggest challenge here is that "#" is used as a token separator and as the space character, you have to know the context to tell which you've got at any given time, and that makes it difficult to simply split the string. So write a simple parser. This one will accept anything as the first character in a token and then grab everything until it sees the next "#".
# Encoding table: letter -> 1-based alphabet position; ' ' -> '###'; the empty
# string is paired with the lone '#' that separates letters.
ABSTRACT = {"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
            " ":"###","":"#" }
# Decoding table (code -> character).
ABSTRACT_SHIFTED = {value: key for key, value in ABSTRACT.items()}
user_input = "8#15#23###23#1#19###9#20"


def from_abstract(s):
    """Decode an abstract string by scanning it token by token.

    '#' is both the letter separator and (tripled) the space marker, so the
    scanner decides from the current position which one it is looking at:
    '###' is a space, a single '#' is a separator, and anything else is a
    code that runs up to the next '#' or the end of the input.

    Args:
        s: encoded text, e.g. "8#15#23###23#1#19###9#20".

    Returns:
        The decoded string.  A code that is not in ABSTRACT_SHIFTED decodes
        to a single space.
    """
    result = []
    pos = 0
    while pos < len(s):
        if s.startswith("###", pos):
            token = "###"
        elif s[pos] == "#":
            token = "#"
        else:
            # codes are terminated with #
            end = s.find("#", pos)
            # ...except at end of line, where the code runs to the very end
            if end == -1:
                end = len(s)
            token = s[pos:end]
        pos += len(token)
        result.append(ABSTRACT_SHIFTED.get(token, ' '))
    return ''.join(result)


print(from_abstract(user_input))

Tokenize Function Not Working As Expected - Python

These are the instructions:
Write a function tokenize(input_string) that takes a string containing an expression and returns a list of tokens. Tokens in this small language will be delimited by whitespace, and so any time there is a space (or several spaces in a row) in the input string, we want to split around that.
You should not use the built-in string operation split, but rather should structure your code using the tools we have developed so far.
When all is said and done, For example, running the tokenizer on this string:
tokenize("2 2 + 3 4 / .5 0.2 3.2 + - COS")
should return:
['2', '2', '+', '3', '4', '/', '.5', '0.2', '3.2', '+', '-', 'COS']
This is my code:
def tokenize(input_string):
# Build tokens one character at a time: `token` is the token in progress and
# `tokens` the finished ones.
tokens = []
token = ""
for char in input_string:
# A space ends the token in progress (skipped when the string's last
# character is itself a space, or when no token is pending).
if char == " " and input_string[-1] != char and token != "":
tokens.append(token)
token = ""
# NOTE(review): this compares the character's VALUE with the last character of
# the string, not its position, so it fires on every occurrence of that
# character (each 'd' in "pi load store load") and appends without resetting
# `token`.
elif input_string[-1] == char:
tokens.append(token + char)
elif char != " ":
token += char
return tokens
My code works properly with the given example and similar arguments, but when i run something like:
tokenize("pi load store load")
i get:
['pi', 'load', 'loa', 'store', 'load']
What's the bug? Tried finding it with print statements in various parts of the function to no avail. Also any advice on how to better organize the if statements will be greatly appreciated. Thanks in advance for the help.
I think your flaw is in the line elif input_string[-1] == char:.
If I'm understanding you correctly, you are trying to use this elif case to check if you are at the end of the string, and if you are, to add the last token in the string to your list of tokens.
However, if you have the last character in your string appear more than once, it will go into this case every time; that's why you have both 'loa' and 'load' in your list.
My suggestion is to remove all of your checks for the current character being the same as the last character in the string, and add
# Flush the token still in progress when the input ends without a trailing space.
if token != "":
tokens.append(token)
after your for loop.
To add to the Izaak Weiss answer, please simplify your logic about the checks, this could be a solution:
def tokenize(input_string):
    """Split *input_string* into tokens on runs of spaces, without str.split.

    Args:
        input_string: text whose tokens are separated by one or more spaces.

    Returns:
        The list of non-empty tokens, in order of appearance.
    """
    tokens = []
    token = ''
    for char in input_string:
        if char != ' ':
            token += char
        elif token:
            # A space ends the token in progress; extra spaces are ignored.
            tokens.append(token)
            token = ''
    # The last token has no trailing space to terminate it.
    if token:
        tokens.append(token)
    return tokens
Here are 2 approaches:
The "plain" one that you were attempting to implement (tokenizing the string "manually")
A little bit more advanced one that uses [Python]: str.find(sub[, start[, end]]) (also rfind)
Of course there are others as well (e.g. ones that use recursion, or even regular expressions), but they probably are too advanced.
def tokenize_plain(input_string):
    """Tokenize by walking the string one character at a time.

    Args:
        input_string: text whose tokens are separated by one or more spaces.

    Returns:
        The list of non-empty tokens, in order of appearance.
    """
    tokens = []
    current_chars = []
    for char in input_string:
        if char != " ":
            current_chars.append(char)
            continue
        # A space ends the token in progress; repeated spaces add nothing.
        if current_chars:
            tokens.append("".join(current_chars))
            current_chars = []
    # The last token has no trailing space to terminate it.
    if current_chars:
        tokens.append("".join(current_chars))
    return tokens
def tokenize_find(input_string):
    """Tokenize by jumping from space to space with str.find.

    Args:
        input_string: text whose tokens are separated by one or more spaces.

    Returns:
        The list of non-empty tokens, in order of appearance.  Empty input
        and leading, trailing or repeated spaces never produce an empty
        token.
    """
    tokens = []
    length = len(input_string)
    start = 0
    while start < length:
        end = input_string.find(" ", start)
        if end == -1:
            # No further space: the last token runs to the end of the string.
            end = length
        if end > start:
            tokens.append(input_string[start:end])
        # Resume just past the space (or past the end, which ends the loop).
        start = end + 1
    return tokens
if __name__ == "__main__":
    # Run both implementations on the sample inputs so that their results can
    # be compared line by line.
    samples = ("pi load store load", "2 2 + 3 4 / .5 0.2 3.2 + - COS")
    for tokenize in (tokenize_plain, tokenize_find):
        for text in samples:
            print("{}('{}') = {}".format(tokenize.__name__, text, tokenize(text)))
Output:
c:\Work\Dev\StackOverflow\q46372240>c:\Work\Dev\VEnvs\py35x64_test\Scripts\python.exe a.py
tokenize_plain('pi load store load') = ['pi', 'load', 'store', 'load']
tokenize_plain('2 2 + 3 4 / .5 0.2 3.2 + - COS') = ['2', '2', '+', '3', '4', '/', '.5', '0.2', '3.2', '+', '-', 'COS']
tokenize_find('pi load store load') = ['pi', 'load', 'store', 'load']
tokenize_find('2 2 + 3 4 / .5 0.2 3.2 + - COS') = ['2', '2', '+', '3', '4', '/', '.5', '0.2', '3.2', '+', '-', 'COS']

Python for loop - stripping line as you go

I am trying to strip a line of code so that only the comment at the end is saved. Because # signs can be included within "" marks, to do this I am trying to cycle through the line catching pairs of " marks so that it ignores any # marks within "" marks. When I use a code visualiser on my code below, after the second for loop it seems to go back to processing s as if it has just stripped the first " mark. I can't see what I'm doing wrong here, because the print statement I have included on line 19 shows that s has been stripped to after the second ", but when the code returns to the top, it starts cycling again from after the first ". Any idea of what I am doing wrong here?
# Goal: reduce `s` to its trailing comment, ignoring '#' inside "..." pairs.
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# Number of '"' characters seen so far; an even count means "outside a string".
quoteCount = 0
# NOTE(review): a `for` loop keeps iterating over the string object that `s`
# named when the loop started; rebinding `s` in the body does not change which
# characters the loop visits.
for char in s:
if quoteCount%2 == 0:
if char == '#':
s = s[s.index('#'):]
break
if char == '"':
quoteCount = quoteCount + 1
# Drop everything up to and including the next quote in the current `s`.
s = s[s.index('"'):]
s = s.lstrip('"')
# Inner loop: scans the freshly sliced `s` for the closing quote.  After its
# `break`, control returns to the OUTER loop, which continues with the next
# character of the original string (see the note above).
for char in s:
if char == '"':
quoteCount = quoteCount + 1
s = s[s.index('"'):]
s = s.lstrip('"')
print(s)
break
print(s)
If I understand your question correctly you only want to keep the last comment (#lots of hash(#) symbols here).
To do this you don't need the nested for loop.
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# Number of '"' characters seen so far; an even count means the scan is
# currently outside a quoted string.
quoteCount = 0
# Stays empty when the line has no comment.
comment = ''
# Walk the line once by position.  `s` is left untouched during the scan, so
# the index always refers to the string being iterated.
for position, char in enumerate(s):
    if char == '"':
        quoteCount = quoteCount + 1
    elif char == '#' and quoteCount % 2 == 0:
        # First '#' outside quotes: the comment runs to the end of the line.
        comment = s[position:]
        break
s = comment
print(s)
Easier to remove the quoted strings with a regular expression:
import re
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# One double-quoted string: a quote, any run of non-quote characters, a quote.
# (Escaped quotes inside a string are not handled.)
pattern = r'"[^"]*"'
# With the quoted strings removed, the first '#' left starts the comment.
s = re.sub(pattern, '', s)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(s[s.index('#'):])
Output:
#lots of hash(#) symbols here
Your code is overly complicated so I suggest you use an alternative method to finding the comment like the already mentioned regex one or the one I came up with.
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# Keep only the text after the last quotation mark (the whole line when there
# is none): no quoted string can start after that point.
tail = s.rpartition('"')[2]
# The first '#' in that tail starts the comment, if there is one.
hash_at = tail.find('#')
if hash_at >= 0:
    s = tail[hash_at:]
else:
    s = ''  # No comment was found
print(s)

Sql to Pymongo generation using Pyparsing

I am using the "awesomest" parsing library in the world existing right now. Pyparsing. The problem at hand is to generate a PyMongo dictionary from a given SQL string (For select statements). The grammar def I am using is following :
# Top-level SELECT grammar: keyword, column spec, a second keyword, table
# list, optional WHERE clause.  Named results: "columns", "collections", "where".
# NOTE(review): select_key_word, column_list, form_key_word, table_name_list
# and where_condition are defined outside this snippet; `form_key_word`
# presumably matches FROM -- confirm.
sql_stmt = (select_key_word + ('*' | column_list).setResultsName
("columns") + form_key_word + table_name_list.setResultsName
("collections") +
# "" is the default result used when no WHERE clause is present.
Optional(where_condition, "").setResultsName("where"))
Here the select_key_word, column_list etc. constructs are valid grammar defs. and using this i can parse a string like "Select * from collection_1 where (Sal = 1000 or Sal=5000) AND Car>2"
The problem i have is that, the where part is being parsed is like this :
[[u'where', [u'(', [u'Sal', '=', u'1000'], 'or', [u'Sal', '=', u'5000'], u')'], 'and', [u'Car', '>', u'2']]]
Which is fine if i want it translated into something sqlish. But a valid representation of that same in pymongo would be something like this :
{u'$or': [{u'$and': [{u'Sal': u'1000'}, {u'Sal': u'5000'}]}, {u'Car': {u'$gte': u'2'}}]}
That is where I am stuck. Can anybody give me a direction? it seems to me that setParseAction will be a way to go, but just can't figure that out
The code for where_condition is:
# Forward declaration so that the grammar can refer to itself (needed for the
# parenthesised sub-expression below).
where_expr = Forward()
# NOTE(review): the get_*_as_grammar helpers are defined outside this snippet;
# what each one matches is inferred from its name only -- confirm.
and_keyword = get_conjunction_as_grammar("and")
or_keyword = get_conjunction_as_grammar("or")
in_operation = get_operation_as_grammar("in")
column_value = get_real_number_as_grammar() | get_int_as_grammar() | \
quotedString
binary_operator = get_bin_op_as_grammar()
col_name = get_column_name_as_grammar()
# One condition, grouped into its own sub-list: a binary comparison, an IN
# list, or a parenthesised where_expr.
where_condn = Group(
(col_name + binary_operator + column_value) |
(col_name + in_operation + "(" + delimitedList(column_value) + ")" ) |
# NOTE(review): the "(" and ")" literals are not suppressed, which matches the
# u'(' and u')' tokens in the parse result quoted above.
("(" + where_expr + ")")
)
# A condition optionally followed by AND/OR and another where_expr.  AND and
# OR are treated alike here, so no precedence between them is encoded.
where_expr << where_condn + ZeroOrMore((and_keyword | or_keyword)
+ where_expr)
where_condition = Group(CaselessLiteral("where") + where_expr)
Thanks in advance. Please let me know if you need any other information.
Yes, parse actions are just the thing for this kind of project. Also, if you are trying to evaluate an expression that can have parenthetical nesting of operations of varying precedence, then operatorPrecedence is often a handy shortcut:
from pyparsing import *
# Keywords, matched case-insensitively.
and_keyword = CaselessKeyword("and")
or_keyword = CaselessKeyword("or")
in_operation = CaselessKeyword("in")
# An operand: a quoted string or a bare alphanumeric word (column names and
# numbers alike).
value = quotedString | Word(alphanums)
comparisonOp = oneOf("= != > < >= <=")
# Parentheses are matched but kept out of the results.
LPAR,RPAR = map(Suppress,"()")
valueList = LPAR + delimitedList(value) + RPAR
# Either `value op value` or `value IN (v1, v2, ...)`; the IN list is grouped
# so that it reaches the parse action as a single token.
comparisonExpr = value + comparisonOp + value | value + in_operation + Group(valueList)
def makePymongoComparison(tokens):
    """Render one parsed comparison as the text of a Mongo query clause.

    Args:
        tokens: three items -- left operand, operator, right operand.  For
            the 'in' operator the right operand is an iterable of values.

    Returns:
        A string such as "{'Sal': '1000'}", "{'Car': {'$gt': '2'}}" or
        "{'Car': {'$in': ['1','2','3']}}".

    Raises:
        KeyError: if the operator is not one of = != > < >= <= or in.
    """
    v1, op, v2 = tokens
    if op == 'in':
        return "{'%s': {'$in': [%s]}}" % (v1, ','.join("'%s'" % v for v in v2))
    if op == '=':
        # Equality needs no operator: {field: value}.
        return "{'%s': '%s'}" % (v1, v2)
    mongo_op = {
        "!=": "$ne",
        ">": "$gt",
        "<": "$lt",
        ">=": "$gte",
        "<=": "$lte",
    }[op]
    # The operator clause is a nested document, so it is emitted as-is and not
    # wrapped in quotes like a plain value.
    return "{'%s': {'%s': '%s'}}" % (v1, mongo_op, v2)
# Replace each matched comparison's tokens with the string the function returns.
comparisonExpr.setParseAction(makePymongoComparison)
def handleBinaryOp(op):
    """Build a parse action that folds `a <op> b <op> c ...` into one clause.

    Args:
        op: Mongo operator name without the leading '$', e.g. "and" or "or".

    Returns:
        A function that takes the parse results for one chain of operands and
        returns a string such as "{'$or': [{...}, {...}]}".
    """
    def pa(tokens):
        # The chain arrives as one group [operand, keyword, operand, ...];
        # every second item, starting with the first, is an operand.
        operands = tokens.asList()[0][::2]
        # $and / $or take an array of clauses, hence the square brackets.
        return "{'$%s': [%s]}" % (op, ', '.join(operands))
    return pa
# Parse actions for the two boolean connectives.
handleAnd = handleBinaryOp("and")
handleOr = handleBinaryOp("or")
whereOperand = comparisonExpr
# Operators are listed from highest to lowest precedence, so AND binds tighter
# than OR; operatorPrecedence also takes care of parenthesised
# sub-expressions.
where_expr = operatorPrecedence(whereOperand,
    [
    (and_keyword, 2, opAssoc.LEFT, handleAnd),
    (or_keyword, 2, opAssoc.LEFT, handleOr),
    ])
where_condition = Group(CaselessLiteral("where") + where_expr)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(where_expr.parseString("(Sal = 1000 or Sal=5000) AND Car>2")[0])
print(where_expr.parseString("(Sal = 1000 or Sal=5000) AND Car in (1,2,3)")[0])
prints:
{'$and': {'$or': {'Sal': '1000'}, {'Sal': '5000'}}, {'Car': '{'$gt': '2'}'}}
{'$and': {'$or': {'Sal': '1000'}, {'Sal': '5000'}}, {'Car': {'$in': ['1','2','3']}}}
Still needs a few tweaks, but I hope this gets you further along.

How can I use pyparsing to parse nested expressions that have multiple opener/closer types?

I'd like to use pyparsing to parse an expression of the form: expr = '(gimme [some {nested [lists]}])', and get back a python list of the form: [[['gimme', ['some', ['nested', ['lists']]]]]]. Right now my grammar looks like this:
# Each nestedExpr handles exactly one opener/closer pair.  No `content`
# argument is given, so (per the pyparsing docs) the content defaults to
# whitespace-delimited text that excludes only that pair's own delimiters.
nestedParens = nestedExpr('(', ')')
nestedBrackets = nestedExpr('[', ']')
nestedCurlies = nestedExpr('{', '}')
# Alternation: try each bracket kind in turn.
# NOTE(review): the three expressions do not reference each other, so inside a
# (...) match the '[' and '{' characters are ordinary text -- consistent with
# the output quoted below.
enclosed = nestedParens | nestedBrackets | nestedCurlies
Presently, enclosed.searchString(expr) returns a list of the form: [[['gimme', ['some', '{nested', '[lists]}']]]]. This is not what I want because it's not recognizing the square or curly brackets, but I don't know why.
Here's a pyparsing solution that uses a self-modifying grammar to dynamically match the correct closing brace character.
from pyparsing import (Forward, Literal, NoMatch, Word, alphas, nestedExpr,
                       oneOf, printables)

data = '(gimme [some {nested, nested [lists]}])'

# Any of the three opening brackets may start a nested group.
opening = oneOf("( { [")
# Characters allowed in plain content: every printable except the brackets.
nonBracePrintables = ''.join(c for c in printables if c not in '(){}[]')
# Opening bracket -> the closing bracket that has to match it.
closingFor = dict(zip("({[", ")}]"))

# `closing` is a placeholder whose definition is swapped while parsing, so
# that it always matches the closer of the innermost open bracket.
closing = Forward()
# initialize closing with an expression (nothing is open yet, so never match)
closing << NoMatch()
# Definitions of `closing` for the enclosing, still-open groups.
closingStack = []


def pushClosing(t):
    """Parse action for `opening`: expect the closer paired with t[0].

    The previous definition of `closing` is saved on `closingStack` so it
    can be restored once this group has been closed.
    """
    closingStack.append(closing.expr)
    closing << Literal(closingFor[t[0]])


def popClosing():
    """Parse action for `closing`: restore the enclosing group's closer."""
    closing << closingStack.pop()


opening.setParseAction(pushClosing)
closing.setParseAction(popClosing)

# Word(alphas) is tried first so that punctuation such as ',' ends up as a
# separate token instead of being glued to the preceding word.
matchedNesting = nestedExpr(opening, closing,
                            Word(alphas) | Word(nonBracePrintables))

# print() with a single argument behaves the same on Python 2 and Python 3.
print(matchedNesting.parseString(data).asList())
prints:
[['gimme', ['some', ['nested', ',', 'nested', ['lists']]]]]
Updated: I posted the above solution because I had actually written it over a year ago as an experiment. I just took a closer look at your original post, and it made me think of the recursive type definition created by the operatorPrecedence method, and so I redid this solution, using your original approach - much simpler to follow! (might have a left-recursion issue with the right input data though, not thoroughly tested):
from pyparsing import Forward, Word, alphas, nestedExpr

# Forward declaration: the three nested expressions below need to refer to
# `enclosed` as their content before `enclosed` itself can be defined.
enclosed = Forward()
nestedParens = nestedExpr('(', ')', content=enclosed)
nestedBrackets = nestedExpr('[', ']', content=enclosed)
nestedCurlies = nestedExpr('{', '}', content=enclosed)
# Content is a word, a comma, or any of the three bracketed groups, which
# makes every bracket kind nestable inside every other.
enclosed << (Word(alphas) | ',' | nestedParens | nestedBrackets | nestedCurlies)

data = '(gimme [some {nested, nested [lists]}])'
# print() with a single argument behaves the same on Python 2 and Python 3.
print(enclosed.parseString(data).asList())
Gives:
[['gimme', ['some', ['nested', ',', 'nested', ['lists']]]]]
EDITED:
Here is a diagram of the updated parser, using the railroad diagramming support coming in pyparsing 3.0.
This should do the trick for you. I tested it on your example:
import re
import ast
def parse(s):
    """Parse one bracketed expression into nested Python lists.

    '(', '[' and '{' each open a nested list and must be closed by the
    matching ')', ']' or '}'.  Words are maximal runs of characters that are
    neither brackets nor whitespace.

    Args:
        s: text holding exactly one top-level bracketed expression, e.g.
            '(gimme [some {nested [lists]}])'.

    Returns:
        The list for the outermost bracket pair, e.g.
        ['gimme', ['some', ['nested', ['lists']]]].

    Raises:
        ValueError: if brackets are unbalanced or mismatched, or if the input
            is not exactly one bracketed expression.
    """
    closer_for = {'(': ')', '[': ']', '{': '}'}
    closers = set(closer_for.values())

    root = []           # collects whatever appears at the top level
    stack = [root]      # stack[-1] is the list currently being filled
    expected = []       # closers still owed, innermost last
    word = []           # characters of the word in progress

    def flush():
        # Finish the word in progress (if any) and store it in the open list.
        if word:
            stack[-1].append(''.join(word))
            del word[:]

    for char in s:
        if char in closer_for:
            flush()
            child = []
            stack[-1].append(child)
            stack.append(child)
            expected.append(closer_for[char])
        elif char in closers:
            flush()
            if not expected or expected.pop() != char:
                raise ValueError("unexpected %r in %r" % (char, s))
            stack.pop()
        elif char.isspace():
            flush()
        else:
            word.append(char)
    flush()

    if expected:
        raise ValueError("missing %r in %r" % (expected[-1], s))
    if len(root) != 1 or not isinstance(root[0], list):
        raise ValueError("expected exactly one bracketed expression: %r" % (s,))
    return root[0]
Comment if you need more

Categories

Resources