PyParsing: parse if not a keyword - python

I am trying to parse a file as follows:
testp.txt
title = Test Suite A;
timeout = 10000
exp_delay = 500;
log = TRUE;
sect
{
type = typeA;
name = "HelloWorld";
output_log = "c:\test\out.log";
};
sect
{
name = "GoodbyeAll";
type = typeB;
comm1_req = 0xDEADBEEF;
comm1_resp = (int, 1234366);
};
The file first contains a section with parameters and then some sects. I can parse a file containing just parameters and I can parse a file just containing sects but I can't parse both.
from pyparsing import *
from pathlib import Path
command_req = Word(alphanums)
command_resp = "(" + delimitedList(Word(alphanums)) + ")"
kW = Word(alphas+'_', alphanums+'_') | command_req | command_resp
keyName = ~Literal("sect") + Word(alphas+'_', alphanums+'_') + FollowedBy("=")
keyValue = dblQuotedString.setParseAction( removeQuotes ) | OneOrMore(kW,stopOn=LineEnd())
param = dictOf(keyName, Suppress("=")+keyValue+Optional(Suppress(";")))
node = Group(Literal("sect") + Literal("{") + OneOrMore(param) + Literal("};"))
final = OneOrMore(node) | OneOrMore(param)
param.setDebug()
p = Path(__file__).with_name("testp.txt")
with open(p) as f:
try:
x = final.parseFile(f, parseAll=True)
print(x)
print("...")
dx = x.asDict()
print(dx)
except ParseException as pe:
print(pe)
The issue I have is that param matches against sect so it expects a =. So I tried putting in ~Literal("sect") in keyName but that just leads to another error:
Exception raised:Found unwanted token, "sect", found '\n' (at char 188), (line:4, col:56)
Expected end of text, found 's' (at char 190), (line:6, col:1)
How do I get it use one parse method for sect and another (param) if not sect?
My final goal would be to have the whole lot in a Dict with the global params and sects included.
EDIT
Think I've figured it out:
This line...
final = OneOrMore(node) | OneOrMore(param)
...should be:
final = ZeroOrMore(param) + ZeroOrMore(node)
But I wonder if there is a more structured way (as I'd ultimately like a dict)?

Related

PLY Lex: ID could be anything

I have a following simple format:
BLOCK ID {
SUBBLOCK ID {
SUBSUBBLOCK ID {
SOME STATEMENTS;
};
};
};
I configured ply to work with this format. But the issue is that ID could be any string including "BLOCK", "SUBBLOCK", etc.
In the lexer I define ID as:
#TOKEN(r'[a-zA-Z_][a-zA-Z_0-9]*')
def t_ID(self, t):
t.type = self.keyword_map.get(t.value, "ID")
return t
But it means that BLOCK word will not be allowed as a block name.
How I can overcome this issue?
The easiest solution is to create a non-terminal name to be used instead of ID in productions which need a name, such as block : BLOCK name braced_statements:
# Docstring is added later
def p_name(self, p):
p[0] = p[1]
Then you compute the productions for name and assign them to p_name's docstring by executing this before you generate the parser:
Parser.p_name.__doc__ = '\n| '.join(
['name : ID']
+ list(Lexer.keyword_map.values())
)
I probably dont understand your question
but i think something like the following would work (its been a long time since
I messed with PLY so keep in mind this is pseudocode
FUNCTION_ARGS = zeroOrMore(lazy('STATEMENT'),sep=',')
FUNCTION_CALL = t_ID + lparen + FUNCTION_ARGS + rparen
STATEMENT= FUNCTION_CALL | t_Literal | t_ID
SUBBLOCK = Literal('SUBBLOCK') + t_ID + lbrace + STATEMENT + rbrace
BLOCK = Literal('BLOCK') + lbrace + oneOrMore(SUBBLOCK) + rbrace

Dealing with ZeroOrMore in pyparsing

I'm trying to parse pactl list with pyparsing: So far all parse is working correctly but I cannot make ZeroOrMore to work correctly.
I can find foo: or foo: bar and try to deal with that with ZeroOrMore but it doesn't work, I have to add special case "Argument:" to find results without value, but there're Argument: foo results (with value) so it will not work, and I expect any other property to exist without value.
With this definition, and a fixed pactl list output:
#!/usr/bin/env python
#
# parsing pactl list
#
from pyparsing import *
import os
from subprocess import check_output
import sys
data = '''
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
'''
indentStack = [1]
stmt = Forward()
identifier = Word(alphanums+"-_.")
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums)))
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1)))))
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop = (prop_name + prop_section)
stmt << ( section | prop | ("Argument:") | value | prop_val )
syntax = OneOrMore(stmt)
parseTree = syntax.parseString(data)
parseTree.pprint()
This gets:
$ ./pactl.py
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
[[['Module'], ['6']],
[['Argument:'],
[[['Name'], ['module-alsa-card']]],
[[['Usage counter'], ['0']]],
['Properties:',
[[[['module.author'], ['"Lennart Poettering"']]],
[[['module.description'], ['"ALSA Card"']]],
[[['module.version'], ['"14.0-rebootstrapped"']]]]]]]
So far so good, but removing special case for Argument: it gets into error, as ZeroOrMore doesn't behave as expected:
#!/usr/bin/env python
#
# parsing pactl list
#
from pyparsing import *
import os
from subprocess import check_output
import sys
data = '''
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
'''
indentStack = [1]
stmt = Forward()
identifier = Word(alphanums+"-_.")
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums)))
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1))))).setDebug()
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop = (prop_name + prop_section)
stmt << ( section | prop | value | prop_val )
syntax = OneOrMore(stmt)
parseTree = syntax.parseString(data)
parseTree.pprint()
This results in:
$ ./pactl.py
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
Match Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) at loc 19(3,9)
Matched Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) -> [[['Argument'], ['Name']]]
Match Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) at loc 1(2,1)
Exception raised:Expected ":", found '#' (at char 8), (line:2, col:8)
Traceback (most recent call last):
File "/home/alberto/projects/node/pacmd_list_json/./pactl.py", line 55, in <module>
parseTree = syntax.parseString(partial)
File "/usr/local/lib/python3.9/site-packages/pyparsing.py", line 1955, in parseString
raise exc
File "/usr/local/lib/python3.9/site-packages/pyparsing.py", line 6336, in checkUnindent
raise ParseException(s, l, "not an unindent")
pyparsing.ParseException: Expected {{Group:({Group:(W:(ABCD...)) Suppress:("#") Group:(W:(0123...))}) indented block} | {"Properties:" indented block} | Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) | Group:({Group:(W:(ABCD...)) Suppress:("=") Group:(Combine:({{W:(ABCD...) | <SP><TAB>}}...))})}, found ':' (at char 41), (line:4, col:13)
See from setDebug value grammar ZeroOrMore is getting the tokens from next line [[['Argument'], ['Name']]]
I tried LineEnd() and other tricks but none works.
Any idea on how to deal with ZeroOrMore to stop on LineEnd() or without special cases?
NOTE: Real output can be retrieved using:
env = os.environ.copy()
env['LANG'] = 'C'
data = check_output(
['pactl', 'list'], universal_newlines=True, env=env)
indentedBlock is not the easiest pyparsing element to work with. But there are a few things that you are doing that are getting in your way.
To debug this, I broke down some of your more complex expressions, use setName() to give them names, and then added .setDebug(). Like this:
identifier = Word(alphas, alphanums+"-_.").setName("identifier").setDebug()
This will tell pyparsing to output a message whenever this expression is about to be matched, if it matched successfully, or if not, the exception that was raised.
Match identifier at loc 1(2,1)
Matched identifier -> ['Module']
Match identifier at loc 15(3,5)
Matched identifier -> ['Argument']
Match identifier at loc 15(3,5)
Matched identifier -> ['Argument']
Match identifier at loc 23(3,13)
Exception raised:Expected identifier, found ':' (at char 23), (line:3, col:13)
It looks like these expressions are messing up the indentedBlock matching, by processing whitespace that should be indentation space:
Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))
The " character in the Word and the whitespace lead me to believe you are trying to match quoted strings. I replaced this expression with:
Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString))
You also need to take care not to read past the end of the line, or you'll also mess up the indentedBlock indentation tracking. I added this expression for a newline at the top:
NL = LineEnd()
and then used it as the stopOn argument to OneOrMore and ZeroOrMore:
prop_val_value = Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString(), stopOn=NL)).setName("prop_val_value")#.setDebug()
prop_val = Group(identifier + Suppress("=") + Group(prop_val_value)).setName("prop_val")#.setDebug()
Here is the parser I ended up with:
indentStack = [1]
stmt = Forward()
NL = LineEnd()
identifier = Word(alphas, alphanums+"-_.").setName("identifier").setDebug()
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums))).setName("sect_def")#.setDebug()
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
#~ value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1))))).setDebug()
value_label = originalTextFor(OneOrMore(identifier)).setName("value_label")#.setDebug()
value = Group(value_label
+ Suppress(":")
+ Optional(~NL + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_.') | quotedString(), stopOn=NL))))).setName("value")#.setDebug()
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
#~ prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop_val_value = Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString(), stopOn=NL)).setName("prop_val_value")#.setDebug()
prop_val = Group(identifier + Suppress("=") + Group(prop_val_value)).setName("prop_val")#.setDebug()
prop = (prop_name + prop_section).setName("prop")#.setDebug()
stmt << ( section | prop | value | prop_val )
Which gives this:
[[['Module'], ['6']],
[[['Argument']],
[['Name', ['module-alsa-card']]],
[['Usage counter', ['0']]],
['Properties:',
[[['module.author', ['"Lennart Poettering"']]],
[['module.description', ['"ALSA Card"']]],
[['module.version', ['"14.0-rebootstrapped"']]]]]]]

Convert lvm.conf to python dict using pyparsing

I'm trying to convert lvm.conf to python (JSON like) object.
LVM (Logical Volume Management) configuration file looks like this:
# Configuration section config.
# How LVM configuration settings are handled.
config {
# Configuration option config/checks.
# If enabled, any LVM configuration mismatch is reported.
# This implies checking that the configuration key is understood by
# LVM and that the value of the key is the proper type. If disabled,
# any configuration mismatch is ignored and the default value is used
# without any warning (a message about the configuration key not being
# found is issued in verbose mode only).
checks = 1
# Configuration option config/abort_on_errors.
# Abort the LVM process if a configuration mismatch is found.
abort_on_errors = 0
# Configuration option config/profile_dir.
# Directory where LVM looks for configuration profiles.
profile_dir = "/etc/lvm/profile"
}
local {
}
log {
verbose=0
silent=0
syslog=1
overwrite=0
level=0
indent=1
command_names=0
prefix=" "
activation=0
debug_classes=["memory","devices","activation","allocation","lvmetad","metadata","cache","locking","lvmpolld","dbus"]
}
I'd like to get Python dict, like this:
{ "section_name"":
{"value1" : 1,
"value2" : "some_string",
"value3" : [list, of, strings]}... and so on.}
The parser function:
def parseLvmConfig2(path="/etc/lvm/lvm.conf"):
try:
EQ, LBRACE, RBRACE, LQ, RQ = map(pp.Suppress, "={}[]")
comment = pp.Suppress("#") + pp.Suppress(pp.restOfLine)
configSection = pp.Word(pp.alphas + "_") + LBRACE
sectionKey = pp.Word(pp.alphas + "_")
sectionValue = pp.Forward()
entry = pp.Group(sectionKey + EQ + sectionValue)
real = pp.Regex(r"[+-]?\d+\.\d*").setParseAction(lambda x: float(x[0]))
integer = pp.Regex(r"[+-]?\d+").setParseAction(lambda x: int(x[0]))
listval = pp.Regex(r'(?:\[)(.*)?(?:\])').setParseAction(lambda x: eval(x[0]))
pp.dblQuotedString.setParseAction(pp.removeQuotes)
struct = pp.Group(pp.ZeroOrMore(entry) + RBRACE)
sectionValue << (pp.dblQuotedString | real | integer | listval)
parser = pp.ZeroOrMore(configSection + pp.Dict(struct))
res = parser.parseFile(path)
print(res)
except (pp.ParseBaseException, ) as e:
print("lvm.conf bad format {0}".format(e))
The result is messy and the question is, how to make pyparsing do the job, without additional logic?
UPDATE(SOLVED):
For anyone who wants to understand pyparsing better, please check #PaulMcG explanation below. (Thanks for pyparsing, Paul! )
import pyparsing as pp
def parseLvmConf(conf="/etc/lvm/lvm.conf", res_type="dict"):
EQ, LBRACE, RBRACE, LQ, RQ = map(pp.Suppress, "={}[]")
comment = "#" + pp.restOfLine
integer = pp.nums
real = pp.Word(pp.nums + "." + pp.nums)
pp.dblQuotedString.setParseAction(pp.removeQuotes)
scalar_value = real | integer | pp.dblQuotedString
list_value = pp.Group(LQ + pp.delimitedList(scalar_value) + RQ)
key = pp.Word(pp.alphas + "_", pp.alphanums + '_')
key_value = pp.Group(key + EQ + (scalar_value | list_value))
struct = pp.Forward()
entry = key_value | pp.Group(key + struct)
struct <<= pp.Dict(LBRACE + pp.ZeroOrMore(entry) + RBRACE)
parser = pp.Dict(pp.ZeroOrMore(entry))
parser.ignore(comment)
try:
#return lvm.conf as dict
if res_type == "dict":
return parser.parseFile(conf).asDict()
# return lvm.conf as list
elif res_type == "list":
return parser.parseFile(conf).asList()
else:
#return lvm.conf as ParseResults
return parser.parseFile(conf)
except (pp.ParseBaseException,) as e:
print("lvm.conf bad format {0}".format(e))
Step 1 should always be to at least rough out a BNF for the format you are going to parse. This really helps organize your thoughts, and gets you thinking about the structure and data you are parsing, before starting to write actual code.
Here is a BNF that I came up with for this config (it looks like a Python string because that makes it easy to paste into your code for future reference - but pyparsing does not work with or require such strings, they are purely a design tool):
BNF = '''
key_struct ::= key struct
struct ::= '{' (key_value | key_struct)... '}'
key_value ::= key '=' (scalar_value | list_value)
key ::= word composed of alphas and '_'
list_value ::= '[' scalar_value [',' scalar_value]... ']'
scalar_value ::= real | integer | double-quoted-string
comment ::= '#' rest-of-line
'''
Notice that the opening and closing {}'s and []'s are at the same level, rather than having an opener in one expression and a closer in another.
This BNF also will allow for structs nested within structs, which is not strictly required in the sample text you posted, but since your code looked to be supporting that, I included it.
Translating to pyparsing is pretty straightforward from here, working bottom-up through the BNF:
EQ, LBRACE, RBRACE, LQ, RQ = map(pp.Suppress, "={}[]")
comment = "#" + pp.restOfLine
integer = ppc.integer #pp.Regex(r"[+-]?\d+").setParseAction(lambda x: int(x[0]))
real = ppc.real #pp.Regex(r"[+-]?\d+\.\d*").setParseAction(lambda x: float(x[0]))
pp.dblQuotedString.setParseAction(pp.removeQuotes)
scalar_value = real | integer | pp.dblQuotedString
# `delimitedList(expr)` is a shortcut for `expr + ZeroOrMore(',' + expr)`
list_value = pp.Group(LQ + pp.delimitedList(scalar_value) + RQ)
key = pp.Word(pp.alphas + "_", pp.alphanums + '_')
key_value = pp.Group(key + EQ + (scalar_value | list_value))
struct = pp.Forward()
entry = key_value | pp.Group(key + struct)
struct <<= (LBRACE + pp.ZeroOrMore(entry) + RBRACE)
parser = pp.ZeroOrMore(entry)
parser.ignore(comment)
Running this code:
try:
res = parser.parseString(lvm_source)
# print(res.dump())
res.pprint()
return res
except (pp.ParseBaseException, ) as e:
print("lvm.conf bad format {0}".format(e))
Gives this nested list:
[['config',
['checks', 1],
['abort_on_errors', 0],
['profile_dir', '/etc/lvm/profile']],
['local'],
['log',
['verbose', 0],
['silent', 0],
['syslog', 1],
['overwrite', 0],
['level', 0],
['indent', 1],
['command_names', 0],
['prefix', ' '],
['activation', 0],
['debug_classes',
['memory',
'devices',
'activation',
'allocation',
'lvmetad',
'metadata',
'cache',
'locking',
'lvmpolld',
'dbus']]]]
I think the format you would prefer is one where you can access the values as keys in a nested dict or in a hierarchical object. Pyparsing has a class called Dict that will do this at parse time, so that the results names are automatically assigned for nested subgroups. Change these two lines to have their sub-entries automatically dict-ified:
struct <<= pp.Dict(LBRACE + pp.ZeroOrMore(entry) + RBRACE)
parser = pp.Dict(pp.ZeroOrMore(entry))
Now if we call dump() instead of pprint(), we'll see the hierarchical naming:
[['config', ['checks', 1], ['abort_on_errors', 0], ['profile_dir', '/etc/lvm/profile']], ['local'], ['log', ['verbose', 0], ['silent', 0], ['syslog', 1], ['overwrite', 0], ['level', 0], ['indent', 1], ['command_names', 0], ['prefix', ' '], ['activation', 0], ['debug_classes', ['memory', 'devices', 'activation', 'allocation', 'lvmetad', 'metadata', 'cache', 'locking', 'lvmpolld', 'dbus']]]]
- config: [['checks', 1], ['abort_on_errors', 0], ['profile_dir', '/etc/lvm/profile']]
- abort_on_errors: 0
- checks: 1
- profile_dir: '/etc/lvm/profile'
- local: ''
- log: [['verbose', 0], ['silent', 0], ['syslog', 1], ['overwrite', 0], ['level', 0], ['indent', 1], ['command_names', 0], ['prefix', ' '], ['activation', 0], ['debug_classes', ['memory', 'devices', 'activation', 'allocation', 'lvmetad', 'metadata', 'cache', 'locking', 'lvmpolld', 'dbus']]]
- activation: 0
- command_names: 0
- debug_classes: ['memory', 'devices', 'activation', 'allocation', 'lvmetad', 'metadata', 'cache', 'locking', 'lvmpolld', 'dbus']
- indent: 1
- level: 0
- overwrite: 0
- prefix: ' '
- silent: 0
- syslog: 1
- verbose: 0
You can then access the fields as res['config']['checks'] or res.log.indent.

Convert a file to static C string declaration

I would like convert number of files to static string declarations in C. I have trying writing a quick script in Python (shown below), but it doesn't seem exactly simple and a number of issue came up trying to compile the output.
import os, sys
from glob import glob
from re import sub
test_dirs = ('basics', 'float', 'import', 'io', 'misc')
tests = sorted(test_file for test_files in (glob('{}/*.py'.format(dir)) for dir in test_dirs) for test_file in test_files)
def cfunc_name(t):
return sub(r'/|\.|-', '_', t)
for t in tests:
print("void {}(void* data) {{".format(cfunc_name(t)))
with open(t) as f:
lines = ''.join(f.readlines())
cstr = sub('"', '\\"', lines)
cstr = sub('\n', '\"\n\"', cstr)
print(" const char * pystr = \"\"\n\"{}\";".format(cstr))
print("end:\n ;\n}")
print("struct testcase_t core_tests[] = {")
for t in tests:
print(" {{ \"{}\", test_{}_fn, TT_ENABLED_, 0, 0 }},".format(t, cfunc_name(t)))
print("END_OF_TESTCASES };")
Looking for an existing tool is not exactly obvious (may be my search keywords are not quite right)... Is there a simple UNIX tool that does this or has anyone come across something similar?
Does this work for you? https://code.google.com/p/txt2cs/
The main issue I can think of is new lines and escaping if you want to roll your own.
I have used the txt2cs implementation as a reference and ended-up with just a few lines of Python that do all the escaping. As I didn't want to add extra things to the build system, it's easier to have this done in Python. This is going to be integrated in test automation, which is already a complex beast.
The main takeaway is that RE substitutions have to be done in a certain order and aren't the ideal tool for this purpose.
import os, sys
from glob import glob
from re import sub
def escape(s):
lookup = {
'\0': '\\0',
'\t': '\\t',
'\n': '\\n\"\n\"',
'\r': '\\r',
'\\': '\\\\',
'\"': '\\\"',
}
return "\"\"\n\"{}\"".format(''.join([lookup[x] if x in lookup else x for x in s]))
def chew_filename(t):
return { 'func': "test_{}_fn".format(sub(r'/|\.|-', '_', t)), 'desc': t.split('/')[1] }
def script_to_map(t):
r = { 'name': chew_filename(t)['func'] }
with open(t) as f: r['script'] = escape(''.join(f.readlines()))
return r
test_function = (
"void {name}(void* data) {{\n"
" const char * pystr = {script};\n"
" do_str(pystr);\n"
"}}"
)
testcase_struct = (
"struct testcase_t {name}_tests[] = {{\n{body}\n END_OF_TESTCASES\n}};"
)
testcase_member = (
" {{ \"{desc}\", {func}, TT_ENABLED_, 0, 0 }},"
)
testgroup_struct = (
"struct testgroup_t groups[] = {{\n{body}\n END_OF_GROUPS\n}};"
)
testgroup_member = (
" {{ \"{name}/\", {name}_tests }},"
)
test_dirs = ('basics', 'float', 'import', 'io', 'misc')
output = []
for group in test_dirs:
tests = glob('{}/*.py'.format(group))
output.extend([test_function.format(**script_to_map(test)) for test in tests])
testcase_members = [testcase_member.format(**chew_filename(test)) for test in tests]
output.append(testcase_struct.format(name=group, body='\n'.join(testcase_members)))
testgroup_members = [testgroup_member.format(name=group) for group in test_dirs]
output.append(testgroup_struct.format(body='\n'.join(testgroup_members)))
print('\n\n'.join(output))
Below is what the output looks like, as you can see the initial ""\n and '\\n\"\n\"' make it quite a bit more readable:
void test_basics_break_py_fn(void* data) {
const char * pystr = ""
"while True:\n"
" break\n"
"\n"
"for i in range(4):\n"
" print('one', i)\n"
" if i > 2:\n"
" break\n"
" print('two', i)\n"
"\n"
"for i in [1, 2, 3, 4]:\n"
" if i == 3:\n"
" break\n"
" print(i)\n"
"";
do_str(pystr);
}

Parsing significant whitespace with a PEG in Python (specificly Parsley)

I'm creating a syntax that supports significant whitespace (most like the "Z" lisp variant than Python or yaml, but same idea)
I came across this article on how to do significant whitespace parsing in a pegasus a PEG parser for C#
But I've been less than successful at converting that to parsley, looks like the #STATE# variable in Pegasus follows backtracking in some way.
This is the closest I've gotten to a simple parser, If I use the version of indent with look ahead it can't parse children, and if I use the version without, it can't parse siblings.
If this is a limitation of parsley and I need to use PyPEG or Parsimonious or something, I'm open to that, but it seems like if the internal indent variable could follow the PEGs internal backtracking this would all work.
import parsley
def indent(s):
s['i'] += 2
print('indent i=%d' % s['i'])
def deindent(s):
s['i'] -= 2
print('deindent i=%d' % s['i'])
grammar = parsley.makeGrammar(r'''
id = <letterOrDigit+>
eol = '\n' | end
nots = anything:x ?(x != ' ')
node = I:i id:name eol !(fn_print(_state['i'], name)) -> i, name
#I = !(' ' * _state['i'])
I = (' '*):spaces ?(len(spaces) == _state['i'])
#indent = ~~(!(' ' * (_state['i'] + 2)) nots) -> fn_indent(_state)
#deindent = ~~(!(' ' * (_state['i'] - 2)) nots) -> fn_deindent(_state)
indent = -> fn_indent(_state)
deindent = -> fn_deindent(_state)
child_list = indent (ntree+):children deindent -> children
ntree = node:parent (child_list?):children -> parent, children
nodes = ntree+
''', {
'_state': {'i': 0},
'fn_indent': indent,
'fn_deindent': deindent,
'fn_print': print,
})
test_string = '\n'.join((
'brother',
' brochild1',
#' gchild1',
#' brochild2',
#' grandchild',
'sister',
#' sischild',
#'brother2',
))
nodes = grammar(test_string).nodes()

Categories

Resources