I am using pyparsing to parse a hex string, and I am looking for an automatic way to print the parse tree.
The closest approach is the dump() method, but it prints a lot of duplicated information.
For example:
from pyparsing import * #Word, Optional, OneOrMore, Group, ParseException
data = Forward()
arrayExpr = Forward()
def data_array(s, l, t):
    # Parse action for the array-length byte: t[0] is a two-digit hex count.
    # It rebinds the Forward `arrayExpr` so that exactly that many `data`
    # items are expected next, then passes the length token through.
    n = int(t[0], 16)
    # pyparsing releases before 3.0 raise ValueError for `0 * expr`, so a
    # zero-length array ("00") is mapped to Empty(), which consumes nothing.
    arrayExpr << ((n * data) if n else Empty())
    return t[0]
array = Word(hexnums, exact=2).setParseAction(data_array) + arrayExpr
data << (Literal('01') + array.setResultsName('array')
| Literal('03') + Word(hexnums, exact=2)('char')
| Literal('04') + Word(hexnums, exact=2)('boolean'))
frame = (Word(hexnums, exact=2)('id') \
+ data('data'))('frame')
result = frame.parseString("02010203010302");
print result.dump()
The goal is for the output of result.dump() to look something like this:
- frame: ['02', '01', '03', '03', '01', '04', '02', '03', '02']
- id: 02
- array: ['03', '03', '01', '04', '02', '03', '02']
- char: 01
- boolean: 02
- char: 02
Pretty-printing isn't mandatory; what I am after is the tree structure.
Is there a way to produce this output, or will I need to add a setParseAction to every rule?
Looks like you'll need a setParseAction for each of the rules.
From parsing to object hierarchy: "Attach parse actions to each expression, but here is the trick: use a class instead of a function. The class's init method will get called, and return an instance of that class. "
I prefer to add an answer instead of editing the question, since there is too much code.
It isn't perfect: the levels don't come out right, and the classes could be discarded if I could get the resultsName from printAction. Maybe I should create a new question :-/
If someone uses it and improves it, please say how :)
#!/usr/bin/python
from pyparsing import * #Word, Optional, OneOrMore, Group, ParseException
data = Forward()
level = 0
arrayExpr = Forward()
def data_array(s, l, t):
    # Parse action for the array-length byte: t[0] is a two-digit hex count.
    # It rebinds the Forward `arrayExpr` so that exactly that many `data`
    # items are expected next, then passes the length token through.
    n = int(t[0], 16)
    # pyparsing releases before 3.0 raise ValueError for `0 * expr`, so a
    # zero-length array ("00") is mapped to Empty(), which consumes nothing.
    arrayExpr << ((n * data) if n else Empty())
    return t[0]
class TreeChild(object):
    """Leaf of the printed tree; meant to be attached as a parse action.

    Subclasses provide the ``name`` label that is shown before the value.
    """

    def __init__(self, t):
        # Keep whatever was passed in; __str__ prints its first element.
        self.args = t

    def __str__(self):
        # One space of indentation per nesting level (module-level `level`).
        indent = ' ' * level
        label = " %s: " % self.name
        return indent + label + self.args[0] + "\n"
class TreeBranch(object):
    """Inner node of the printed tree; meant to be attached as a parse action.

    Subclasses provide the ``name`` label. The children are the items of the
    value passed to the constructor and are rendered one nesting level deeper.
    """

    def __init__(self, t):
        self.args = t

    def __str__(self):
        global level
        # Children are rendered with the module-level `level` raised by one;
        # try/finally guarantees it is restored even if a child fails.
        level = level + 1
        try:
            # Every child already ends with a newline, so join without a
            # separator; a " " separator shifted all but the first child
            # one column to the right.
            childs = "".join(map(str, self.args))
        finally:
            level = level - 1
        ret = " %s: " % self.name + '\n'
        return ' ' * level + ret + childs + "\n"
class Frame(TreeBranch):
name = 'frame'
class Char(TreeChild):
name = 'char'
class Boolean(TreeChild):
name = 'boolean'
class Id(TreeChild):
name = 'id'
class Array(TreeBranch):
name = 'array'
array = Suppress(Word(hexnums, exact=2).setParseAction(data_array)) + arrayExpr
data << (Suppress(Literal('01')) + array.setResultsName('array').setParseAction(Array)
| Suppress(Literal('03')) + Word(hexnums, exact=2)('char').setParseAction(Char)
| Suppress(Literal('04')) + Word(hexnums, exact=2)('boolean').setParseAction(Boolean))
frame = (Word(hexnums, exact=2)('id').setParseAction(Id) \
+ data('data'))('frame').setParseAction(Frame)
result = frame.parseString("020103030104020302");
print result[0]
Related
I want to define my own function as below:
def myown(df, ADD1, ADD2 = None, OtherArgument_1, OtherArgument_2):
tmp = df
tmp['NEWADD'] = (tmp['ADD1'] + ' ' + tmp['ADD2']).str.strip()
return tmp
I know this is incorrect, so I can add an if statement in the function:
def myown(df, ADD1, ADD2 = None, OtherArgument_1, OtherArgument_2):
tmp = df
if ADD2 == None:
tmp['NEWADD'] = tmp[ADD1].str.strip()
else:
tmp['NEWADD'] = (tmp[ADD1] + ' ' + tmp[ADD2]).str.strip()
However, if I don't know in advance how many ADD inputs there will be, how can I modify this?
For example, this time there are 5 ADD columns to be combined, and next time there may be 3. It is impractical to rewrite the function each time like this:
def myown(df, ADD1, ADD2, ADD3, ADD4, ADD5, OtherArgument_1, OtherArgument_2):
tmp = df
tmp['NEWADD'] = (tmp[ADD1] + ' ' + tmp[ADD2] + ' ' + tmp[ADD3] + ' ' + tmp[ADD4] + ' ' + tmp[ADD5]).str.strip()
You can accomplish this by using loops and lists like this:
def myown(df, add_args, OtherArgument_1, OtherArgument_2):
    """Combine several string columns of *df* into a new 'NEWADD' column.

    Args:
        df: DataFrame to update; it is modified in place.
        add_args: iterable of column names to combine, in order.
        OtherArgument_1, OtherArgument_2: accepted but not used here.

    Returns:
        The same DataFrame, with 'NEWADD' holding the stripped column values
        joined by single spaces ('' for every row if *add_args* is empty).
    """
    tmp = df
    new_add = None
    for column in add_args:
        part = tmp[column].str.strip()
        # Separate the pieces with a space; plain concatenation glued the
        # values together ("1 Main" + "St" -> "1 MainSt").
        new_add = part if new_add is None else new_add + ' ' + part
    # The final strip removes the padding left behind by empty pieces.
    tmp['NEWADD'] = '' if new_add is None else new_add.str.strip()
    return tmp
Your add_args parameter must be a list, which looks like this:
add_args = [ADD1, ADD2, ADDn]
I'm writing a program that can parse math papers written in .tex files. Here are what I want:
The program is supposed to detect the beginning, the end, sections, subsections, subsubsections, theorems, lemmas, definitions, conjectures, corollaries, proposition, exercises, notations and examples in a math paper and ignore the rest of the contents to produce a summary.
In the beginning the program is supposed to retain all characters until reaching the token MT. In this case the lexer should preserve the token and enter ig mode. Then it should ignore all characters unless it detects a theorem/lemma/definition/conjecture/corollary/example/exercise/notation/proposition, in which case it temporarily enters the INITIAL mode and retains it, or a (sub/subsub)section, in which case it should temporarily enter the sec mode.
\newtheorem{<name>}{<heading>}[<counter>] and \newtheorem{<name>}[<counter>]{<heading>} are detected as TH ptext THCC ptext THC ptext and TH ptext THCS ptext THSC ptext THC respectively where ptext is a bunch of TEXT.
import sys
import logging
from ply.lex import TOKEN
if sys.version_info[0] >= 3:
raw_input = input
tokens = (
'BT', 'BL', 'BD', 'BCONJ', 'BCOR', 'BE', 'ET', 'EL', 'ED', 'ECONJ', 'ECOR', 'EE', 'SEC', 'SSEC', 'SSSEC', 'ES', 'TEXT','ITEXT','BIBS','MT','BN','EN','BEXE','EEXE','BP','EP','TH','THCS','THSC','THCC','THC',
)
states = (('ig', 'exclusive'), ('sec', 'exclusive'), ('th', 'exclusive'), ('tht','exclusive'),('thc','exclusive'))
logging.basicConfig(
level = logging.DEBUG,
filename = "lexlog.txt",
filemode = "w",
format = "%(filename)10s:%(lineno)4d:%(message)s"
)
log = logging.getLogger()
th_temp = ''
thn_temp = ''
term_dic = {'Theorem':'','Lemma':'','Corollary':'','Definition':'','Conjecture':'','Example':'','Exercise':'','Notation':'','Proposition':''}
idb_list = ['','','','','','','','','']
ide_list = ['','','','','','','','','']
bb = r'\\begin\{'
eb = r'\\end\{'
ie = r'\}'
def finalize_terms():
    """Build the begin/end patterns for every environment name found so far.

    For each kind whose entry in ``term_dic`` is non-empty, the matching slot
    of ``idb_list`` / ``ide_list`` is set to ``bb + name + ie`` and
    ``eb + name + ie`` respectively. Slots of kinds without a name are left
    untouched. Both lists are printed at the end.
    """
    # The position in this tuple is the slot index used in both lists.
    kinds = ('Theorem', 'Lemma', 'Corollary', 'Definition', 'Conjecture',
             'Example', 'Exercise', 'Notation', 'Proposition')
    for slot, kind in enumerate(kinds):
        env_name = term_dic[kind]
        if env_name != '':
            idb_list[slot] = bb + env_name + ie
            ide_list[slot] = eb + env_name + ie
    print(idb_list)
    print(ide_list)
Here are some of the parsing functions:
def t_TH(t):
    r'\\newtheorem\{'
    # Matches the literal text "\newtheorem{" and switches the lexer to the
    # 'th' state before returning the token.
    t.lexer.begin('th')
    return t
def t_th_THCS(t):
    r'\}\['
    # Matches "}[" and switches the lexer to the 'thc' state before
    # returning the token.
    t.lexer.begin('thc')
    return t
def t_tht_THC(t):
    r'\}'
    # Matches the closing "}". Records the collected text `th_temp` in
    # `term_dic` under the key `thn_temp`, reports what happened, clears both
    # buffers and returns the lexer to the INITIAL state.
    #
    # Both buffers are rebound below, so they must be declared global;
    # without the declaration the first read raises UnboundLocalError.
    global th_temp, thn_temp
    # dict.has_key() was removed in Python 3; use the `in` operator.
    if thn_temp not in term_dic:
        print(f"{thn_temp} is unknown!")
    elif len(th_temp) == 0:
        print(f"No abbreviation for {thn_temp} is found!")
    else:
        term_dic[thn_temp] = th_temp
        print(f"The abbreviation for {thn_temp} is {th_temp}!")
    th_temp = ''
    thn_temp = ''
    t.lexer.begin('INITIAL')
    return t
def t_th_THCC(t):
    r'\}\{'
    # Matches "}{" and switches the lexer to the 'tht' state before
    # returning the token.
    t.lexer.begin('tht')
    return t
def t_thc_THSC(t):
    r'\]\{'
    # Matches "]{" and switches the lexer to the 'tht' state before
    # returning the token.
    t.lexer.begin('tht')
    return t
#TOKEN(idb_list[0])
def t_ig_BT(t):
t.lexer.begin('INITIAL')
return t
#TOKEN(ide_list[0])
def t_ET(t):
t.lexer.begin('ig')
return t
def t_INITIAL_sec_thc_TEXT(t):
    r'[\s\S]'
    # Matches any single character (whitespace or not) and returns it
    # unchanged as a token.
    return t
def t_th_TEXT(t):
    r'[\s\S]'
    # Matches any single character and appends it to the module-level
    # buffer `th_temp`, then returns the token.
    #
    # `th_temp` is rebound here, so it must be declared global, and
    # `t.value` is the matched text itself (an attribute, not a method).
    global th_temp
    th_temp = th_temp + t.value
    return t
def t_tht_TEXT(t):
    r'[\s\S]'
    # Matches any single character and appends it to the module-level
    # buffer `thn_temp`, then returns the token.
    #
    # `thn_temp` is rebound here, so it must be declared global, and
    # `t.value` is the matched text itself (an attribute, not a method).
    global thn_temp
    thn_temp = thn_temp + t.value
    return t
def t_ig_ITEXT(t):
    r'[\s\S]'
    # Matches any single character and deliberately returns nothing, so the
    # character produces no token (PLY drops tokens whose rule returns None).
    pass
import ply.lex as lex
lex.lex(debug=True, debuglog = log)
Here are the errors:
ERROR: /Users/CatLover/Documents/Python_Beta/TexExtractor/texlexparse.py:154: No regular expression defined for rule 't_ET'
I don't know why the regular expression defined for 't_ET' etc using #TOKEN do not work.
Ply is a parser generator. It takes your parser/lexer description and compiles a parser/lexer from it. You cannot change the description of the language during the parse.
In this particular case, you might be better off writing a streaming ("online") scanner. But if you want to use Ply, then you will be better off not trying to modify the grammar to ignore parts of the input. Just parse the entire input and ignore the parts you're not interested in. You'll probably find that the code is much simpler.
So, i have some expressions in EBNF form for parsing some systems of differential equations
END = Literal(';').suppress()
POINT = Literal('.')
COMMA = Literal(',').suppress()
COLON = Word(':', max=1).suppress()
EQUAL = Literal('=').suppress()
VARNAME = Word(alphas, max=1)
NATNUM = Word(nums) # 1234567890
SIGN = oneOf('+ -')
OPER = oneOf('+ - * / ^ ')
REALNUM = Combine(Optional(SIGN) + NATNUM + Optional(POINT + NATNUM)) # Real Numbers 2.3, 4.5
STEP = Dict(Group('Step' + COLON + REALNUM + END)) # Step: 0.01 ;
RANGE = Dict(Group('Range' + COLON + REALNUM + END)) # Range: 2.0 ;
VARINIT = Group(VARNAME + Suppress('=') + REALNUM) # x=32.31
ZEROVAR = Dict(Group('Vars0' + COLON + VARINIT + Optional(COMMA + VARINIT) + END))
COEFF = Dict(Group('Coeff' + COLON + VARINIT + Optional(COMMA + VARINIT) + END))
EXPESS = Forward()
EXPESS << Combine((REALNUM | VARNAME) + ZeroOrMore(OPER + EXPESS), adjacent=False)
IDENT = Combine('d'+VARNAME)
FUNC = Group(IDENT + EQUAL + EXPESS)
DIFUR = Dict(Group('Exp' + COLON + FUNC + ZeroOrMore(COMMA + FUNC) + END))
STATE = Suppress("Start") + DIFUR + ZEROVAR + COEFF + STEP + RANGE + Suppress("Stop")
I'd like to get this kind of JSON by parsing the final STATE expression:
{
'Vars0': {
'y', '0.55',
'x', '0.02',
},
'Exp': {
'dx': 'a*x-y',
'dy': 'b*x-y',
'dz':'800-2*4*x+z'
},
'Range': '2.0',
'Step': '0.05',
'Coeff': {
'a': '5',
'b': '2'
}
}
But instead I get something ugly like this, for example: 'Vars0': ([(['y', '0.55'], {}), (['x', '0.02'], {})], {}), and so on.
What is my stupid mistake?
P.S. The plain text to be parsed can look like this:
What you have isn't JSON, it's a Python dictionary variable, which fortunately means it can be pretty printed with the pprint module.
Have a look, specifically, at pprint.pprint: https://docs.python.org/2/library/pprint.html#pprint.pprint .
Setting an indent of 4 and a width of 1 might produce something pleasing to you. Example: https://ideone.com/pYESaW
For learning purposes, I'm trying to convert a Chef interpreter project to python 3.4 and trying to wrangle the libraries involved into their newest versions, but when it comes to funcparserlib I'm a little over my head.
Here's the Chef script:
from pprint import pprint
from collections import namedtuple
import re
import logging
import funcparserlib.parser as p
from funcparserlib.lexer import make_tokenizer
from funcparserlib.lexer import Spec
from funcparserlib.contrib.lexer import space, newline
from funcparserlib.contrib.common import sometok, unarg
from common import *
log = logging.getLogger('preserve.chefparser')
#log.addHandler(logging.StreamHandler())
#log.setLevel(logging.DEBUG)
pos = 0
# order matters
instruction_spec = [
Spec(x.lower().split()[0], x) for x in [
'Take', 'Put', 'Fold', 'Add', 'Remove', 'Combine', 'Divide', 'Stir', 'Mix', 'Clean', 'Pour', 'Set aside', 'Refrigerate', 'from', 'the', 'for', 'contents of the', 'until', 'refrigerator', 'minute', 'minutes', 'hour', 'hours', 'well'
]
]
instruction_spec.insert(0, Spec('to', r'to'))
instruction_spec.insert(0, Spec('into', r'into'))
instruction_spec.insert(0, Spec('add_dry', 'Add dry ingredients'))
instruction_spec.insert(0, Spec('liquefy', 'Liquefy|Liquify'))
instruction_spec.append(Spec('serve_with', r'Serve with'))
instruction_spec.append(Spec('bowl', 'mixing bowl'))
instruction_spec.append(Spec('dish', 'baking dish'))
instruction_spec.append(space)
instruction_spec.append(Spec('string', '[A-Za-z]+'))
instruction_spec.append(Spec('ordinal', '[0-9]+(st|nd|rd|th)'))
instruction_spec.append(Spec('number', '[0-9]+'))
# Token specs for the recipe-level tokenizer.
# NOTE(review): order presumably matters (earlier specs take precedence) --
# confirm against the tokenizer in use.
tokens = [
    Spec('ingredients_start', 'Ingredients'),
    Spec('method_start', r'^Method', re.MULTILINE),
    # `(?:es)?` makes the plural suffix optional as a whole; the previous
    # `[es]?` was a character class and could never match "pinches"/"dashes".
    Spec('dry_measure', r' g | kg | pinch(?:es)? '),
    Spec('liquid_measure', r' ml | l | dash(?:es)? '),
    Spec('mix_measure', r'cup[s]?|teaspoon[s]?|tablespoon[s]?'),
    Spec('measure_type', 'heaped|level'),
    # TODO hours minutes
    Spec('cooking_time', r'Cooking time:'),
    # TODO gas mark
    Spec('oven', r'Pre\-heat oven to'),
    # Accept the correct spelling "Celsius" as well as the previously
    # required misspelling "Celcius".
    Spec('oven_temp', 'degrees Cel[cs]ius'),
    # serve is treated separate here as it is
    # not necessary for it to appear
    # following 'Method.'
    # But it is treated as just another
    # instruction by the interpreter
    Spec('serve', r'^Serves', re.MULTILINE),
    Spec('number', '[0-9]+'),
    space,
    Spec('period', r'\.'),
    Spec('string', r'[^\.\r\n]+'),
]
def tokenize_minus_whitespace(token_list, input):
    """Tokenize *input* with the specs in *token_list*, dropping 'space' tokens.

    Returns a list of the remaining tokens in their original order.
    """
    tokenizer = make_tokenizer(token_list)
    kept = []
    for tok in tokenizer(input):
        if tok.type in ['space']:
            continue
        kept.append(tok)
    return kept
def tokenize_instruction(spec):
    """Tokenize the instruction text *spec* with ``instruction_spec``, without 'space' tokens."""
    return tokenize_minus_whitespace(instruction_spec, spec)
def tokenize(input):
    """Tokenize the text *input* with the module-level ``tokens`` specs, without 'space' tokens."""
    return tokenize_minus_whitespace(tokens, input)
def parse_instruction(spec):
    """Tokenize one instruction text and parse it into an ``Instruction``.

    *spec* is tokenized with ``tokenize_instruction`` and matched against the
    alternatives combined in ``instruction`` below. The first parsed element,
    lower-cased and with spaces replaced by underscores, becomes the first
    argument of ``Instruction``; the remaining elements become the second.
    """
    # One or more 'string' tokens joined by single spaces.
    string = p.oneplus(sometok('string')) >> (lambda x: ' '.join(x))
    ordinal = sometok('ordinal')
    bowl = sometok('bowl')
    the = sometok('the')
    dish = sometok('dish')
    to = sometok('to')
    into = sometok('into')
    # Joins the results of a repeated `string` match into one string.
    concat = lambda list: ' '.join(list)
    take_i = sometok('take') + (p.oneplus(string) >> concat) + sometok('from') + sometok('refrigerator')
    put_i = sometok('put') + p.skip(p.maybe(the)) + (p.oneplus(string) >> concat) + p.skip(into) + p.maybe(ordinal|the) + bowl
    liquefy_1 = sometok('liquefy') + sometok('contents') + p.maybe(ordinal) + bowl
    liquefy_2 = sometok('liquefy') + (p.oneplus(string) >> concat)
    liquefy_i = liquefy_1 | liquefy_2
    pour_i = sometok('pour') + sometok('contents') + p.maybe(ordinal) + bowl + sometok('into') + the + p.maybe(ordinal) + dish
    fold_i = sometok('fold') + p.skip(p.maybe(the)) + (p.oneplus(string) >> concat) + into + p.maybe(ordinal|the) + bowl
    # cleanup repitition
    add_i = sometok('add') + (p.oneplus(string) >> concat) + p.maybe(to + p.maybe(ordinal|the) + bowl)
    remove_i = sometok('remove') + (p.oneplus(string) >> concat) + p.maybe(sometok('from') + p.maybe(ordinal|the) + bowl)
    combine_i = sometok('combine') + (p.oneplus(string) >> concat) + p.maybe(into + p.maybe(ordinal|the) + bowl)
    divide_i = sometok('divide') + (p.oneplus(string) >> concat) + p.maybe(into + p.maybe(ordinal|the) + bowl)
    add_dry_i = sometok('add_dry') + p.maybe(to + p.maybe(ordinal|the) + bowl)
    stir_1 = sometok('stir') + p.maybe(the + p.maybe(ordinal|the) + bowl) + sometok('for') + sometok('number') + (sometok('minute')|sometok('minutes'))
    stir_2 = sometok('stir') + (p.oneplus(string) >> concat) + into + the + p.maybe(ordinal) + bowl
    stir_i = stir_1 | stir_2
    mix_i = sometok('mix') + p.maybe(the + p.maybe(ordinal) + bowl) + sometok('well')
    clean_i = sometok('clean') + p.maybe(ordinal|the) + bowl
    # Loop markers start with a free-form 'string' token rather than a
    # keyword; their results are tagged 'loop_start' / 'loop_end'.
    loop_start_i = (sometok('string') + p.maybe(the) + (p.oneplus(string) >> concat)) >> (lambda x: ('loop_start', x))
    loop_end_i = (sometok('string') + p.maybe(p.maybe(the) + (p.oneplus(string) >> concat)) + sometok('until') + string) >> (lambda x: ('loop_end', x))
    set_aside_i = sometok('set') >> (lambda x: (x, None))
    serve_with_i = sometok('serve_with') + (p.oneplus(string) >> concat)
    refrigerate_i = sometok('refrigerate') + p.maybe(sometok('for') + sometok('number') + (sometok('hour')|sometok('hours')))
    # Alternatives are tried in the order listed.
    instruction = ( take_i
                  | put_i
                  | liquefy_i
                  | pour_i
                  | add_i
                  | fold_i
                  | remove_i
                  | combine_i
                  | divide_i
                  | add_dry_i
                  | stir_i
                  | mix_i
                  | clean_i
                  | loop_end_i # -| ORDER matters
                  | loop_start_i # -|
                  | set_aside_i
                  | serve_with_i
                  | refrigerate_i
                  ) >> (lambda x: Instruction(x[0].lower().replace(' ', '_'), x[1:]))
    return instruction.parse(tokenize_instruction(spec))
def parse(input):
    """Tokenize the text *input* and parse it as one or more recipes.

    Each recipe is a header followed by optional ingredients, cooking-time,
    oven-temperature, method and serves sections, combined into a
    ``RecipeNode``. Every instruction sentence of the method section is
    handed to ``parse_instruction``.
    """
    period = sometok('period')
    # One or more 'string' tokens joined by single spaces.
    string = p.oneplus(sometok('string')) >> (lambda x: ' '.join(x))
    number = sometok('number')
    title = string + p.skip(period) >> RecipeTitle
    ingredients_start = sometok('ingredients_start') + p.skip(period) >> IngredientStart
    dry_measure = p.maybe(sometok('measure_type')) + sometok('dry_measure')
    liquid_measure = sometok('liquid_measure')
    mix_measure = sometok('mix_measure')
    # is this valid ? 'g of butter', unit w/o initial_value
    ingredient = (p.maybe(number)
                  + p.maybe(dry_measure
                            | liquid_measure
                            | mix_measure)
                  + string >> unarg(Ingredient)
                  )
    ingredients = p.many(ingredient)
    cooking_time = (p.skip(sometok('cooking_time'))
                    + (number
                       >> unarg(CookingTime))
                    + p.skip(sometok('period'))
                    )
    oven_temp = (p.skip(sometok('oven'))
                 + p.many(number)
                 + p.skip(sometok('oven_temp'))
                 >> unarg(Oven)
                 )
    method_start = sometok('method_start') + p.skip(period)
    # Free text after the title is matched and then skipped.
    comment = p.skip(p.many(string|period))
    header = title + p.maybe(comment)
    # A sentence terminated by a period; its text goes to parse_instruction.
    instruction = (string
                   + p.skip(period)
                   ) >> parse_instruction
    instructions = p.many(instruction)
    program = (method_start + instructions) >> unarg(MethodStart)
    serves = (sometok('serve') + number >> (lambda x: Serve('serve', x[1])) ) + p.skip(period)
    ingredients_section = (ingredients_start + ingredients) >> unarg(IngredientSection)
    recipe = ( header
             + p.maybe(ingredients_section)
             + p.maybe(cooking_time)
             + p.maybe(oven_temp)
             + p.maybe(program)
             + p.maybe(serves)
             ) >> RecipeNode
    main_parser = p.oneplus(recipe)
    return main_parser.parse(tokenize(input))
Running the script fails:
ImportError: cannot import name 'Spec'
The version of funcparserlib.lexer that I have is:
#Snipped some licence. Hint it's MIT.
__all__ = ['make_tokenizer', 'Token', 'LexerError']
import re
class LexerError(Exception):
    """Raised when no token spec matches the input at some position.

    ``place`` is a ``(line, column)`` pair and ``msg`` the text reported
    alongside it (the tokenizer passes the offending input line).
    """

    def __init__(self, place, msg):
        self.place, self.msg = place, msg

    def __str__(self):
        line, pos = self.place
        return 'cannot tokenize data: %d,%d: "%s"' % (line, pos, self.msg)
class Token(object):
    """A lexed token: a type tag, the matched text and optional positions.

    ``start`` and ``end`` are ``(line, column)`` pairs, or ``None`` when the
    position is unknown. Equality and hashing consider only ``type`` and
    ``value``; positions are ignored.
    """

    def __init__(self, type, value, start=None, end=None):
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return 'Token(%r, %r)' % (self.type, self.value)

    def __eq__(self, other):
        # FIXME: Case sensitivity is assumed here
        try:
            return self.type == other.type and self.value == other.value
        except AttributeError:
            # Not token-like: let Python fall back to its default comparison
            # (i.e. "not equal") instead of raising.
            return NotImplemented

    def __ne__(self, other):
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # Defining __eq__ alone makes instances unhashable on Python 3;
        # hash on the same fields that __eq__ compares.
        return hash((self.type, self.value))

    def _pos_str(self):
        """Return 'sl,sp-el,ep:' for a positioned token, '' otherwise."""
        if self.start is None or self.end is None:
            return ''
        else:
            sl, sp = self.start
            el, ep = self.end
            return '%d,%d-%d,%d:' % (sl, sp, el, ep)

    def __str__(self):
        s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
        return s.strip()

    #property
    # NOTE(review): the marker above looks like a stripped @property
    # decorator -- confirm whether callers use `tok.name` or `tok.name()`.
    def name(self):
        return self.value

    def pformat(self):
        """Return the token as aligned columns: position, type, quoted value."""
        return "%s %s '%s'" % (self._pos_str().ljust(20),
                               self.type.ljust(14),
                               self.value)
def make_tokenizer(specs):
    """[(str, (str, int?))] -> (str -> Iterable(Token))

    Each spec is a ``(name, args)`` pair where ``args`` are the arguments for
    ``re.compile``. The returned function lazily yields ``Token`` objects for
    its input, trying the specs in order at every position, and raises
    ``LexerError`` when none of them matches.
    """
    compiled = [(name, re.compile(*args)) for name, args in specs]

    def match_specs(specs, str, i, position):
        # Return the token for the first spec matching at offset `i`.
        line, pos = position
        for type, regexp in specs:
            m = regexp.match(str, i)
            if m is None:
                continue
            value = m.group()
            nls = value.count('\n')
            if nls == 0:
                n_pos = pos + len(value)
            else:
                # Column restarts after the last newline inside the match.
                n_pos = len(value) - value.rfind('\n') - 1
            return Token(type, value, (line, pos + 1), (line + nls, n_pos))
        # No spec matched: report the current input line.
        errline = str.splitlines()[line - 1]
        raise LexerError((line, pos + 1), errline)

    def f(str):
        total = len(str)
        line, pos = 1, 0
        i = 0
        while i < total:
            tok = match_specs(compiled, str, i, (line, pos))
            yield tok
            line, pos = tok.end
            i += len(tok.value)

    return f
# This is an example of a token spec. See also [this article][1] for a
# discussion of searching for multiline comments using regexps
# (including `*?`).
#
# [1]: http://ostermiller.org/findcomment.html
_example_token_specs = [
('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
('COMMENT', (r'//.*',)),
('NL', (r'[\r\n]+',)),
('SPACE', (r'[ \t\r\n]+',)),
('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
('INT', (r'[0-9]+',)),
('INT', (r'\$[0-9A-Fa-f]+',)),
('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/#\^]',)),
('STRING', (r"'([^']|(''))*'",)),
('CHAR', (r'#[0-9]+',)),
('CHAR', (r'#\$[0-9A-Fa-f]+',)),
]
#tokenize = make_tokenizer(_example_token_specs)
And I can sure see why it can't import Spec! There's no Spec there! What's the best way to go about this, guys? Is there a simple "find-replace" that I can do to move forward with this project? Drudging through the repos I could find online (and there are confusing several) wasn't much help to me, but maybe I missed something.
You don't need the Spec class; in the current version of funcparserlib you just have to declare a list of tuples to set up a tokenizer.
See the example in the lexer module:
_example_token_specs = [
('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
('COMMENT', (r'//.*',)),
('NL', (r'[\r\n]+',)),
('SPACE', (r'[ \t\r\n]+',)),
('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
('INT', (r'[0-9]+',)),
('INT', (r'\$[0-9A-Fa-f]+',)),
('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/#\^]',)),
('STRING', (r"'([^']|(''))*'",)),
('CHAR', (r'#[0-9]+',)),
('CHAR', (r'#\$[0-9A-Fa-f]+',)),
]
The Spec class is out of date, according to the source of funcparserlib.
I am using the "awesomest" parsing library in the world existing right now. Pyparsing. The problem at hand is to generate a PyMongo dictionary from a given SQL string (For select statements). The grammar def I am using is following :
sql_stmt = (select_key_word + ('*' | column_list).setResultsName
("columns") + form_key_word + table_name_list.setResultsName
("collections") +
Optional(where_condition, "").setResultsName("where"))
Here the select_key_word, column_list etc. constructs are valid grammar defs. and using this i can parse a string like "Select * from collection_1 where (Sal = 1000 or Sal=5000) AND Car>2"
The problem i have is that, the where part is being parsed is like this :
[[u'where', [u'(', [u'Sal', '=', u'1000'], 'or', [u'Sal', '=', u'5000'], u')'], 'and', [u'Car', '>', u'2']]]
Which is fine if i want it translated into something sqlish. But a valid representation of that same in pymongo would be something like this :
{u'$and': [{u'$or': [{u'Sal': u'1000'}, {u'Sal': u'5000'}]}, {u'Car': {u'$gt': u'2'}}]}
That is where I am stuck. Can anybody give me a direction? It seems to me that setParseAction is the way to go, but I just can't figure it out.
The code for the where_condition is:
where_expr = Forward()
and_keyword = get_conjunction_as_grammar("and")
or_keyword = get_conjunction_as_grammar("or")
in_operation = get_operation_as_grammar("in")
column_value = get_real_number_as_grammar() | get_int_as_grammar() | \
quotedString
binary_operator = get_bin_op_as_grammar()
col_name = get_column_name_as_grammar()
where_condn = Group(
(col_name + binary_operator + column_value) |
(col_name + in_operation + "(" + delimitedList(column_value) + ")" ) |
("(" + where_expr + ")")
)
where_expr << where_condn + ZeroOrMore((and_keyword | or_keyword)
+ where_expr)
where_condition = Group(CaselessLiteral("where") + where_expr)
Thanks in advance. Please let me know if you need any other information.
Yes, parse actions are just the thing for this kind of project. Also, if you are trying to evaluate an expression that can have parenthetical nesting of operations of varying precedence, then operatorPrecedence is often a handy shortcut:
from pyparsing import *
and_keyword = CaselessKeyword("and")
or_keyword = CaselessKeyword("or")
in_operation = CaselessKeyword("in")
value = quotedString | Word(alphanums)
comparisonOp = oneOf("= != > < >= <=")
LPAR,RPAR = map(Suppress,"()")
valueList = LPAR + delimitedList(value) + RPAR
comparisonExpr = value + comparisonOp + value | value + in_operation + Group(valueList)
def makePymongoComparison(tokens):
    """Parse action: render one comparison as the text of a PyMongo filter.

    *tokens* must unpack into ``(left, op, right)``. For ``in`` the right
    side is an iterable of values; otherwise it is a single value.

    Returns a string such as ``{'Sal': '1000'}``, ``{'Car': {'$gt': '2'}}``
    or ``{'Car': {'$in': ['1','2','3']}}``.

    Raises:
        KeyError: if *op* is not one of the supported operators.
    """
    v1, op, v2 = tokens
    if op == 'in':
        members = ','.join("'%s'" % v for v in v2)
        return "{'%s': {'$in': [%s]}}" % (v1, members)
    if op == '=':
        return "{'%s': '%s'}" % (v1, v2)
    mongo_op = {
        "!=": "$ne",
        ">": "$gt",
        "<": "$lt",
        ">=": "$gte",
        "<=": "$lte",
    }[op]
    # The operator sub-document is embedded as-is; wrapping it in quotes
    # produced the invalid text {'Car': '{'$gt': '2'}'}.
    return "{'%s': {'%s': '%s'}}" % (v1, mongo_op, v2)
comparisonExpr.setParseAction(makePymongoComparison)
def handleBinaryOp(op):
    """Return a parse action that renders an ``and``/``or`` chain for PyMongo.

    The action expects ``tokens.asList()[0]`` to alternate operand strings
    and operator keywords (operand, op, operand, ...). It returns the text
    ``{'$<op>': [operand, operand, ...]}``.
    """
    def pa(tokens):
        # Every second item is an operand; the keywords in between are dropped.
        operands = tokens.asList()[0][::2]
        # $and / $or take a list of sub-documents, so wrap the operands in
        # [...]; without the brackets the text is not a valid literal.
        return "{'$%s': [%s]}" % (op, ', '.join(operands))
    return pa
handleAnd = handleBinaryOp("and")
handleOr = handleBinaryOp("or")
whereOperand = comparisonExpr
where_expr = operatorPrecedence(whereOperand,
[
(and_keyword, 2, opAssoc.LEFT, handleAnd),
(or_keyword, 2, opAssoc.LEFT, handleOr),
])
where_condition = Group(CaselessLiteral("where") + where_expr)
print where_expr.parseString("(Sal = 1000 or Sal=5000) AND Car>2")[0]
print where_expr.parseString("(Sal = 1000 or Sal=5000) AND Car in (1,2,3)")[0]
prints:
{'$and': {'$or': {'Sal': '1000'}, {'Sal': '5000'}}, {'Car': '{'$gt': '2'}'}}
{'$and': {'$or': {'Sal': '1000'}, {'Sal': '5000'}}, {'Car': {'$in': ['1','2','3']}}}
Still needs a few tweaks, but I hope this gets you further along.