Regex: Match a string (including nested characters) inside 2 characters, while not capturing those characters, but non-greedy - Python

I'm done making a markup language, and now I'm working on optimized strings for it.
I want nested characters to be allowed inside the strings. With my RPLY lexer there are three characters I need to allow inside of strings (the curly braces and the backtick), and I also need the delimiting characters themselves not to be captured. Here is the regex I've tried:
(?:`)+([\w\W]+)(?:`)+
But this is greedy, so it matches everything between the first backtick it sees and the last one, and it also creates groups (capturing and non-capturing), which RPLY doesn't support.
Is there an alternative to this that's non-greedy but will still allow nested characters? (And no non-capturing groups please; I'm using the RPython distribution of PLY (RPLY) for the lexer and parser, which doesn't support regex groups.)
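For reference, the best I've managed so far is to drop the groups entirely, use a lazy quantifier, and strip the backticks afterwards in plain Python. A standalone sketch of that idea, outside of RPLY (the sample text is just an illustration):

import re

# Lazy match with no groups at all: the whole match (including the two
# delimiting backticks) becomes the token value, and the backticks are
# stripped afterwards instead of being excluded by the regex itself.
STRING_PATTERN = re.compile(r'`[\w\W]+?`')

m = STRING_PATTERN.search('before `a {nested} string` after')
if m:
    print(m.group(0).strip('`'))   # a {nested} string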
If anybody needs more code, here are my two Python classes, one for the lexer and one for the parser.
LEXER
from rply import LexerGenerator

class BMLLexer():
    def __init__(self):
        self.__lexer = LexerGenerator()

    def __add_tokens(self):
        # Statement definitions
        # Note that I need both the OPEN_STATEMENT and CLOSE_STATEMENT to be allowed inside the string.
        self.__lexer.add('OPEN_STATEMENT', r'\{')
        self.__lexer.add('CLOSE_STATEMENT', r'\}')
        # Basic things
        # Note that RPLY's parser doesn't allow multiple groups (includes capturing and non-capturing), so the only option would be just to use functions to remove the first and last backtick from the string, or find something in regex that allows me to automatically get rid of the first and last backtick in the string.
        self.__lexer.add('STRING', r'(?:`)+([\w\W]+)(?:`)+')
        # Ignore spaces
        self.__lexer.ignore(r'\s+')

    def build(self):
        self.__add_tokens()
        return self.__lexer.build()
PARSER
import re
from bmls.language.parser.definitions import BMLDefinitionCapped, BMLDefinitionSingle
from rply import ParserGenerator, Token

class BMLParser():
    """The direct BML parser.
    Raises:
        SyntaxError: If given invalid syntax, the parser will throw a SyntaxError.
    """
    # Init parser
    def __init__(self):
        """Initializes the parser.
        """
        self.pg = ParserGenerator(
            # A list of all token names accepted by the parser.
            [
                'OPEN_STATEMENT',
                'CLOSE_STATEMENT',
                'STRING',
            ],
            precedence=[
                ('left', ['OPEN_STATEMENT']),
                ('left', ['STRING']),
                ('right', ['CLOSE_STATEMENT'])
            ]
        )

    # Parsing
    def parse(self):
        """Parses BML content
        Raises:
            SyntaxError: If given invalid syntax, the parser will throw a SyntaxError.
        Returns:
            list: An HTML/XML formatted list of items.
        """
        # Multi-expression handling
        @self.pg.production('main : expr')
        @self.pg.production('main : main expr')
        def main(p):
            if len(p) == 1:
                return p
            else:
                for x in p[1:]:
                    p[0].append(x)
                return p[0]

        # Expression handling
        @self.pg.production('expr : STRING OPEN_STATEMENT main CLOSE_STATEMENT')
        def definition_capped(p):
            name = self.__toSTRING(p[0])
            definition1 = self.__toSTRING(p[2])
            comp = BMLDefinitionCapped(name, definition1)
            return self.__toHTML(comp)

        @self.pg.production('expr : OPEN_STATEMENT STRING CLOSE_STATEMENT')
        def definition_uncapped(p):
            name = self.__toSTRING(p[1])
            comp = BMLDefinitionSingle(name)
            return self.__toHTML(comp)

        # Expression types
        # This is where the string is parsed, currently using the function __removeFirstLast. I wish to replace this with a supported expression.
        @self.pg.production('expr : STRING')
        def string_expr(p):
            if p[0].gettokentype() == 'STRING':
                return self.__removeFirstLast(self.__toSTRING(p[0]), '`', '`')

        # Error handling
        @self.pg.error
        def error_handle(token):
            raise SyntaxError('Error on Token (\'' + token.gettokentype() + '\' , \'' + token.getstr() + '\')')

    # Public utilities
    def build(self):
        return self.pg.build()

    # Private utilities
    def __removeFirstLast(self, tok, char, endchar):
        if isinstance(tok, str):
            if tok.startswith(char) and tok.endswith(endchar):
                return re.sub(r'^' + char + r'|' + endchar + r'$', '', tok)
            else:
                return tok
        else:
            return tok

    def __toHTML(self, tok):
        output = ''
        if isinstance(tok, BMLDefinitionCapped):
            right = ''
            try:
                for k1 in tok.right:
                    right += k1
            except:
                right += tok.right
            output += '<' + tok.left + '>' + right + '</' + tok.left.split(' ')[0] + '>'
        elif isinstance(tok, BMLDefinitionSingle):
            output += '<' + tok.left + '>'
        elif isinstance(tok, Token):
            output += tok.getstr()
        else:
            output += tok
        return output

    def __toSTRING(self, tok):
        if isinstance(tok, Token):
            return tok.getstr()
        else:
            return tok

    def __toINT(self, tok):
        if isinstance(tok, Token):
            return int(tok.getstr())
        else:
            return int(tok)

Related

python ply parsing to parse function name and the parameter passed to it unambiguously

from ply import lex, yacc
import itertools

reserved = {
    '(?i)fun1' : 'FUN1',
    '(?i)fun2' : 'FUN2'
}

tokens = [
    'PARAM',
    'FUNC_NAME',
    'LP',
    'RP'
]
tokens += reserved.values()

t_PARAM = r'[^\s\(\),&:\"\'~]+'

def t_LP(t):
    r'\('
    return t

def t_RP(t):
    r'\)'
    return t

def t_FUNC_NAME(t):
    r'[a-zA-Z][a-zA-Z0-9.]*'
    if t.value in reserved:
        t.type = reserved[ t.value ]
    return t

def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)

t_ignore = ' \t'

def t_error(t):
    print("Illegal character '%s' on line %d, column %d" % (t.value[0],
          t.lexer.lineno, t.lexer.lexpos))
    t.lexer.skip(1)

def build_lexer():
    lexer = lex.lex()
    query = "fun1(brownfox)"
    lexer.input(query)
    while True:
        tok = lexer.token()
        if not tok:
            break
        print(tok)

build_lexer()
I will have a grammar like this (the yacc grammar rules look like below):
expression : FUNC_NAME(PARAM)
expression : PARAM
Queries to be parsed for eg are:
1.) FUN1(parameter1)
it should be parsed like below:
FUN1 -> FUNC_NAME (as its a reserved word)
parameter1 -> PARAM
2.) FUN2(parameter2)
it should be parsed like below:
FUN2 -> FUNC_NAME (as its a reserved word)
parameter2 -> PARAM
3.) xyz
it should be parsed like below:
xyz -> PARAM
4.) fun3(xyz)
it should be parsed like below:
fun3 -> FUNC_NAME (not a reserved word but its in the form of func_name(param))
xyz -> PARAM
5.) fun4(xyz)
it should be parsed like below:
fun4 -> FUNC_NAME (not a reserved word but its in the form of func_name(param))
xyz -> PARAM
But here the problem is the parameter passed to the function name is also parsed as FUNC_NAME.
What should I do so that expressions containing round opening and closing parentheses are parsed with the function name as a FUNC_NAME token and the parameter as a PARAM token? Please help.
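One direction that might help (just a sketch, not a verified fix): PLY tries function-defined token rules before string-defined ones like t_PARAM, so a lookahead can restrict FUNC_NAME to identifiers that are immediately followed by an opening parenthesis, leaving everything else to fall through to PARAM. The case-insensitive reserved-word check below is an assumption for illustration:

def t_FUNC_NAME(t):
    r'[a-zA-Z][a-zA-Z0-9.]*(?=\()'
    # (?=\() is a lookahead: a '(' must follow, but it is not consumed,
    # so t_LP still produces its own token right after this one.
    reserved_names = {'fun1': 'FUN1', 'fun2': 'FUN2'}
    t.type = reserved_names.get(t.value.lower(), 'FUNC_NAME')
    return t

With that change, fun1(brownfox) should tokenize as FUN1 LP PARAM RP, and a bare xyz as PARAM.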

Parsing command strings defined on lines with PLY

I'm new to the world of lexing and parsing so I hope this is an easy problem to solve. I'm trying to parse a file with groups of tokens that fall into different types with Python's PLY:
STRING STRING QUANTITY STRING STRING # TypeA
STRING STRING STRING STRING STRING STRING # TypeB
STRING STRING QUANTITY QUANTITY QUANTITY # TypeC
Each line is supposed to be one type of command that my program understands. For example, let's call the type defined in the top line TypeA, the second line TypeB, and so on. Since there's supposed to be one command per line, the NEWLINE token at the end of each line indicates the end of a command. I successfully managed to tokenize the file with the following lexer:
# top level tokens
tokens = [
    'QUANTITY',
    'STRING',
    'NEWLINE'
]

# number, possibly in exponential notation, e.g. -1.5e-3.0, or SI suffix, e.g. 'k'
t_QUANTITY = r'[+-]?(\d+\.\d*|\d*\.\d+|\d+)([eE][+-]?\d*\.?\d*|[GMkmunpf])?'

# any group of 2 or more alphanumeric characters, with the first being a letter
t_STRING = r'[a-zA-Z_][a-zA-Z_0-9]*'

# ignore spaces and tabs
t_ignore = ' \t'

# ignore comments
t_ignore_COMMENT = r'\#.*'

# detect new lines
def t_newline(t):
    r'\n+'
    # generate newline token
    t.type = "NEWLINE"
    return t
I want to write a parser which will parse each matched command into different objects. I should end up with a list of the parsed objects.
I tried constructing the following rules:
def p_command(self, p):
    '''command : tokens NEWLINE
               | NEWLINE'''
    print("found command:", list(p))

def p_tokens(self, p):
    '''tokens : type_a_tokens
              | type_b_tokens
              | type_c_tokens'''
    p[0] = p[1]

def p_type_a_tokens(self, p):
    '''type_a_tokens : STRING STRING QUANTITY STRING STRING'''
    p[0] = "TypeA"

def p_type_b_tokens(self, p):
    '''type_b_tokens : STRING STRING STRING STRING STRING STRING'''
    p[0] = "TypeB"

def p_type_c_tokens(self, p):
    '''type_c_tokens : STRING STRING QUANTITY QUANTITY QUANTITY'''
    p[0] = "TypeC"
I get a SyntaxError for the token immediately after the first NEWLINE. Somehow the parser doesn't know to begin parsing a new command after it sees a pattern matching that of p_type_a_tokens.
Please can anyone shed some light on what should be a pretty simple set of parsing rules? Although the documentation for PLY is generally very good, all of the examples I've found so far are for calculators or programming languages where things like newlines don't apply.
Full source:
from ply import lex, yacc

class InputParser(object):
    # top level tokens
    tokens = [
        'QUANTITY',
        'STRING',
        'NEWLINE'
    ]

    t_QUANTITY = r'[+-]?(\d+\.\d*|\d*\.\d+|\d+)([eE][+-]?\d*\.?\d*|[GMkmunpf])?'
    t_STRING = r'[a-zA-Z_][a-zA-Z_0-9]*'

    # ignored characters
    t_ignore = ' \t'

    # ignore comments
    t_ignore_COMMENT = r'\#.*'

    def __init__(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.parser = yacc.yacc(module=self, **kwargs)

    # detect new lines
    def t_newline(self, t):
        r'\n+'
        # generate newline token
        t.type = "NEWLINE"
        return t

    # error handling
    def t_error(self, t):
        # anything that gets past the other filters
        print("Illegal character '%s' on line %i at position %i" %
              (t.value[0], self.lexer.lineno, self.lexer.lexpos))
        # skip forward a character
        t.lexer.skip(1)

    # match commands on their own lines
    def p_command(self, p):
        '''command : tokens NEWLINE
                   | NEWLINE'''
        print("found command:", list(p))
        p[0] = p[1]

    def p_tokens(self, p):
        '''tokens : type_a_tokens
                  | type_b_tokens
                  | type_c_tokens'''
        p[0] = p[1]

    def p_type_a_tokens(self, p):
        '''type_a_tokens : STRING STRING QUANTITY STRING STRING'''
        print("found type a")
        p[0] = "TypeA"

    def p_type_b_tokens(self, p):
        '''type_b_tokens : STRING STRING STRING STRING STRING STRING'''
        print("found type b")
        p[0] = "TypeB"

    def p_type_c_tokens(self, p):
        '''type_c_tokens : STRING STRING QUANTITY QUANTITY QUANTITY'''
        print("found type c")
        p[0] = "TypeC"

    def p_error(self, p):
        if p:
            error_msg = "syntax error '%s'" % p.value
        else:
            error_msg = "syntax error at end of file"
        print(error_msg)

    def parse(self, text):
        self.parser.parse(text, lexer=self.lexer)

if __name__ == "__main__":
    parser = InputParser()
    parser.parse("""
a b 5.5 c d    # TypeA
e f 1.6 g h    # TypeA
i j k l m n    # TypeB
               # empty line
o p -1 2.0 3e4 # TypeC
""")
The problem was caused by the fact that the first rule is special: this is where the parser starts. Since the first rule above cannot combine two commands (found on two adjacent lines), it fails.
I fixed it by adding a new root rule, above p_command, which can take either a single command (for when the file contains only one command) or a list of commands (command_list):
def p_command_list(self, p):
    '''command_list : command
                    | command_list command'''
    if len(p) == 3:
        self.commands.append(p[2])
    else:
        self.commands.append(p[1])
(I also added a commands field to the class to hold the parsed commands)
This can handle multiple commands being "merged" together as is found in my input file.
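For what it's worth, PLY also lets the start symbol be declared explicitly instead of relying on p_command_list being the first rule the parser sees. A small sketch of that variant (assuming the same class layout as above):

class InputParser(object):
    # ... tokens and t_* / p_* rules as above ...

    # Explicit start symbol, so the definition order of the rules no longer matters.
    start = 'command_list'

    def __init__(self, **kwargs):
        self.commands = []   # holds the parsed commands
        self.lexer = lex.lex(module=self, **kwargs)
        self.parser = yacc.yacc(module=self, **kwargs)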

Python equivalent of Fortran list-directed input

I'd like to be able to read data from an input file in Python, similar to the way that Fortran handles a list-directed read (i.e. read (file, *) char_var, float_var, int_var).
The tricky part is that the way Fortran handles a read statement like this is very "forgiving" as far as the input format is concerned. For example, using the previous statement, this:
"some string" 10.0, 5
would be read the same as:
"some string", 10.0
5
and this:
"other string", 15.0 /
is read the same as:
"other string"
15
/
with the value of int_var retaining the same value as before the read statement. And trickier still this:
"nother string", , 7
will assign the values to char_var and int_var but float_var retains the same value as before the read statement.
Is there an elegant way to implement this?
That is indeed tricky - I found it easier to write a pure-Python state-based tokenizer than to think up a regular expression to parse each line (though it is possible).
I've used the link provided by Vladimir as the spec - the tokenizer has some doctests that pass.
def tokenize(line, separator=',', whitespace="\t\n\x20", quote='"'):
    """
    >>> tokenize('"some string" 10.0, 5')
    ['some string', '10.0', '5']
    >>> tokenize(' "other string", 15.0 /')
    ['other string', '15.0', '/']
    >>> tokenize('"nother string", , 7')
    ['nother string', '', '7']
    """
    inside_str = False
    token_started = False
    token = ""
    tokens = []
    separated = False
    just_added = False
    for char in line:
        if char in quote:
            if not inside_str:
                inside_str = True
            else:
                inside_str = False
                tokens.append(token)
                token = ""
                just_added = True
            continue
        if char in (whitespace + separator) and not inside_str:
            if token:
                tokens.append(token)
                token = ""
                just_added = True
            elif char in separator:
                if not just_added:
                    tokens.append("")
                just_added = False
            continue
        token += char
    if token:
        tokens.append(token)
    return tokens

class Character(object):
    def __init__(self, length=None):
        self.length = length
    def __call__(self, text):
        if self.length is None:
            return text
        if len(text) > self.length:
            return text[:self.length]
        return "{{:{}}}".format(self.length).format(text)
def make_types(types, default_value):
    return types, [default_value] * len(types)

def fortran_reader(file, types, default_char="/", default_value=None, **kw):
    types, results = make_types(types, default_value)
    tokens = []
    while True:
        tokens = []
        while len(tokens) < len(results):
            try:
                line = next(file)
            except StopIteration:
                raise StopIteration
            tokens += tokenize(line, **kw)
        for i, (type_, token) in enumerate(zip(types, tokens)):
            if not token or token in default_char:
                continue
            results[i] = type_(token)
        changed_types = yield(results)
        if changed_types:
            types, results = make_types(changed_types, default_value)
I have not tested this thoroughly - but as for the tokenizer - it is designed to work in a Python for statement if the same fields are repeated over and over again - or it can be used with the generator's send method to change the values to be read on each iteration.
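A minimal usage sketch of what that could look like (the file name, the type list and the default value are only illustrations, and this is as untested as the rest):

with open("data.txt") as f:
    # One string, one float and one int per logical record; 0 is kept as the
    # value whenever a field is empty or the record is cut short by '/'.
    reader = fortran_reader(f, [Character(), float, int], default_value=0)
    for char_var, float_var, int_var in reader:
        print(char_var, float_var, int_var)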
Please test, and e-mail me (address at my profile) some testing file. If there is indeed nothing similar, maybe this deserves some polishing and being published on PyPI.
Since I was not able to find a solution to this problem, I decided to write my own solution.
The main drivers are a reader class, and a tokenizer. The reader gets one line at a time from the file, passes it to the tokenizer, and assigns to the variables it is given, getting the next line as necessary.
class FortranAsciiReader(file):

    def read(self, *args):
        """
        Read from file into the given objects
        """
        num_args = len(args)
        num_read = 0
        encountered_slash = False
        # If line contained '/' or read into all variables, we're done
        while num_read < num_args and not encountered_slash:
            line = self.readline()
            if not line:
                raise Exception()
            values = tokenize(line)
            # Assign elements one-by-one into args, skipping empty fields and stopping at a '/'
            for val in values:
                if val == '/':
                    encountered_slash = True
                    break
                elif val == '':
                    num_read += 1
                else:
                    args[num_read].assign(val)
                    num_read += 1
                if num_read == num_args:
                    break
The tokenizer splits the line into tokens in accordance with the way that Fortran performs list directed reads, where ',' and white space are separators, tokens may be "repeated" via 4*token, and a / terminates input.
My implementation of the tokenizer is a bit long to reproduce here, and I also included classes to transparently provide the functionality of the basic Fortran intrinsic types (i.e. Real, Character, Integer, etc.). The whole project can be found on my github account, currently at https://github.com/bprichar/PyLiDiRe. Thanks jsbueno for inspiration for the tokenizer.

Controlling Python PLY lexer states from parser

I am working on a simple SQL SELECT-like query parser and I need to be able to capture subqueries that can occur at certain places, literally. I found lexer states are the best solution and was able to do a POC using curly braces to mark the start and end. However, the subqueries will be delimited by parentheses, not curlies, and parentheses can occur at other places as well, so I can't begin the state with every open-paren. This information is readily available to the parser, so I was hoping to call begin and end at appropriate locations in the parser rules. This however didn't work, because the lexer seems to tokenize the stream all at once, so the tokens get generated in the INITIAL state. Is there a workaround for this problem? Here is an outline of what I tried to do:
def p_value_subquery(p):
    """
    value : start_sub end_sub
    """
    p[0] = "( " + p[1] + " )"

def p_start_sub(p):
    """
    start_sub : OPAR
    """
    start_subquery(p.lexer)
    p[0] = p[1]

def p_end_sub(p):
    """
    end_sub : CPAR
    """
    subquery = end_subquery(p.lexer)
    p[0] = subquery
The start_subquery() and end_subquery() are defined like this:
def start_subquery(lexer):
    lexer.code_start = lexer.lexpos        # Record the starting position
    lexer.level = 1
    lexer.begin('subquery')

def end_subquery(lexer):
    value = lexer.lexdata[lexer.code_start:lexer.lexpos-1]
    lexer.lineno += value.count('\n')
    lexer.begin('INITIAL')
    return value
The lexer tokens are simply there to detect the close-paren:
@lex.TOKEN(r"\(")
def t_subquery_SUBQST(t):
    lexer.level += 1

@lex.TOKEN(r"\)")
def t_subquery_SUBQEN(t):
    lexer.level -= 1

@lex.TOKEN(r".")
def t_subquery_anychar(t):
    pass
I would appreciate any help.
This answer may only be partially helpful, but I would also suggest looking at section "6.11 Embedded Actions" of the PLY documentation (http://www.dabeaz.com/ply/ply.html). In a nutshell, it is possible to write grammar rules in which actions occur mid-rule. It would look something similar to this:
def p_somerule(p):
    '''somerule : A B possible_sub_query LBRACE sub_query RBRACE'''

def p_possible_sub_query(p):
    '''possible_sub_query :'''
    ...
    # Check if the last token read was LBRACE. If so, flip lexer state
    # Sadly, it doesn't seem that the token is easily accessible. Would have to hack it
    if last_token == 'LBRACE':
        p.lexer.begin('SUBQUERY')
Regarding the behavior of the lexer, there is only one token of lookahead being used. So, in any particular grammar rule, at most one extra token has already been read. If you're going to flip lexer states, you need to make sure the flip happens after the current token has been consumed by the parser, but before the parser asks to read the next incoming token.
Also, if possible, I would try to stay away from the yacc() error handling machinery as part of a solution. There is way too much black magic going on in error handling--the more you can avoid it, the better.
I'm a bit pressed for time at the moment, but this seems to be something that could be investigated for the next version of PLY. Will put it on my to-do list.
Based on the PLY author's response, I came up with this better solution. I have yet to figure out how to return the subquery as a token, but the rest looks much better and need not be considered a hack anymore.
def start_subquery(lexer):
    lexer.code_start = lexer.lexpos        # Record the starting position
    lexer.level = 1
    lexer.begin("subquery")

def end_subquery(lexer):
    lexer.begin("INITIAL")

def get_subquery(lexer):
    value = lexer.lexdata[lexer.code_start:lexer.code_end-1]
    lexer.lineno += value.count('\n')
    return value

@lex.TOKEN(r"\(")
def t_subquery_OPAR(t):
    lexer.level += 1

@lex.TOKEN(r"\)")
def t_subquery_CPAR(t):
    lexer.level -= 1
    if lexer.level == 0:
        lexer.code_end = lexer.lexpos        # Record the ending position
        return t

@lex.TOKEN(r".")
def t_subquery_anychar(t):
    pass

def p_value_subquery(p):
    """
    value : check_subquery_start OPAR check_subquery_end CPAR
    """
    p[0] = "( " + get_subquery(p.lexer) + " )"

def p_check_subquery_start(p):
    """
    check_subquery_start :
    """
    # Here last_token would be yacc's lookahead.
    if last_token.type == "OPAR":
        start_subquery(p.lexer)

def p_check_subquery_end(p):
    """
    check_subquery_end :
    """
    # Here last_token would be yacc's lookahead.
    if last_token.type == "CPAR":
        end_subquery(p.lexer)

last_token = None

def p_error(p):
    global subquery_retry_pos
    if p is None:
        print >> sys.stderr, "ERROR: unexpected end of query"
    else:
        print >> sys.stderr, "ERROR: Skipping unrecognized token", p.type, "("+ \
            p.value+") at line:", p.lineno, "and column:", find_column(p.lexer.lexdata, p)
        # Just discard the token and tell the parser it's okay.
        yacc.errok()

def get_token():
    global last_token
    last_token = lexer.token()
    return last_token

def parse_query(input, debug=0):
    lexer.input(input)
    return parser.parse(input, tokenfunc=get_token, debug=0)
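A minimal driver for this might look like the following (the query text is only an illustration, and the SELECT/FROM tokens and rules are assumed to be defined elsewhere in the grammar):

if __name__ == "__main__":
    # Assumes lexer, parser and the rest of the SELECT grammar are built
    # elsewhere in this module; only the subquery handling is shown above.
    print(parse_query("SELECT a FROM (SELECT b FROM t)"))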
Since nobody had an answer, it bugged me enough to find a workaround, and here is an ugly hack using error recovery and restart().
def start_subquery(lexer, pos):
    lexer.code_start = lexer.lexpos        # Record the starting position
    lexer.level = 1
    lexer.begin("subquery")
    lexer.lexpos = pos

def end_subquery(lexer):
    value = lexer.lexdata[lexer.code_start:lexer.lexpos-1]
    lexer.lineno += value.count('\n')
    lexer.begin('INITIAL')
    return value

@lex.TOKEN(r"\(")
def t_subquery_SUBQST(t):
    lexer.level += 1

@lex.TOKEN(r"\)")
def t_subquery_SUBQEN(t):
    lexer.level -= 1
    if lexer.level == 0:
        t.type = "SUBQUERY"
        t.value = end_subquery(lexer)
        return t

@lex.TOKEN(r".")
def t_subquery_anychar(t):
    pass

# NOTE: Due to the nature of the ugly workaround, the CPAR gets dropped, which
# makes it look like there is an imbalance.
def p_value_subquery(p):
    """
    value : OPAR SUBQUERY
    """
    p[0] = "( " + p[2] + " )"

subquery_retry_pos = None

def p_error(p):
    global subquery_retry_pos
    if p is None:
        print >> sys.stderr, "ERROR: unexpected end of query"
    elif p.type == 'SELECT' and parser.symstack[-1].type == 'OPAR':
        lexer.input(lexer.lexdata)
        subquery_retry_pos = parser.symstack[-1].lexpos
        yacc.restart()
    else:
        print >> sys.stderr, "ERROR: Skipping unrecognized token", p.type, "("+ \
            p.value+") at line:", p.lineno, "and column:", find_column(p.lexer.lexdata, p)
        # Just discard the token and tell the parser it's okay.
        yacc.errok()

def get_token():
    global subquery_retry_pos
    token = lexer.token()
    if token and token.lexpos == subquery_retry_pos:
        start_subquery(lexer, lexer.lexpos)
        subquery_retry_pos = None
    return token

def parse_query(input, debug=0):
    lexer.input(input)
    result = parser.parse(input, tokenfunc=get_token, debug=0)

Efficiently match multiple regexes in Python

Lexical analyzers are quite easy to write when you have regexes. Today I wanted to write a simple general analyzer in Python, and came up with:
import re
import sys

class Token(object):
    """ A simple Token structure.
        Contains the token type, value and position.
    """
    def __init__(self, type, val, pos):
        self.type = type
        self.val = val
        self.pos = pos

    def __str__(self):
        return '%s(%s) at %s' % (self.type, self.val, self.pos)

class LexerError(Exception):
    """ Lexer error exception.
        pos:
            Position in the input line where the error occurred.
    """
    def __init__(self, pos):
        self.pos = pos

class Lexer(object):
    """ A simple regex-based lexer/tokenizer.
        See below for an example of usage.
    """
    def __init__(self, rules, skip_whitespace=True):
        """ Create a lexer.
            rules:
                A list of rules. Each rule is a `regex, type`
                pair, where `regex` is the regular expression used
                to recognize the token and `type` is the type
                of the token to return when it's recognized.
            skip_whitespace:
                If True, whitespace (\s+) will be skipped and not
                reported by the lexer. Otherwise, you have to
                specify your rules for whitespace, or it will be
                flagged as an error.
        """
        self.rules = []
        for regex, type in rules:
            self.rules.append((re.compile(regex), type))
        self.skip_whitespace = skip_whitespace
        self.re_ws_skip = re.compile('\S')

    def input(self, buf):
        """ Initialize the lexer with a buffer as input.
        """
        self.buf = buf
        self.pos = 0

    def token(self):
        """ Return the next token (a Token object) found in the
            input buffer. None is returned if the end of the
            buffer was reached.
            In case of a lexing error (the current chunk of the
            buffer matches no rule), a LexerError is raised with
            the position of the error.
        """
        if self.pos >= len(self.buf):
            return None
        else:
            if self.skip_whitespace:
                m = self.re_ws_skip.search(self.buf[self.pos:])
                if m:
                    self.pos += m.start()
                else:
                    return None
            for token_regex, token_type in self.rules:
                m = token_regex.match(self.buf[self.pos:])
                if m:
                    value = self.buf[self.pos + m.start():self.pos + m.end()]
                    tok = Token(token_type, value, self.pos)
                    self.pos += m.end()
                    return tok
            # if we're here, no rule matched
            raise LexerError(self.pos)

    def tokens(self):
        """ Returns an iterator to the tokens found in the buffer.
        """
        while 1:
            tok = self.token()
            if tok is None: break
            yield tok

if __name__ == '__main__':
    rules = [
        ('\d+',          'NUMBER'),
        ('[a-zA-Z_]\w+', 'IDENTIFIER'),
        ('\+',           'PLUS'),
        ('\-',           'MINUS'),
        ('\*',           'MULTIPLY'),
        ('\/',           'DIVIDE'),
        ('\(',           'LP'),
        ('\)',           'RP'),
        ('=',            'EQUALS'),
    ]
    lx = Lexer(rules, skip_whitespace=True)
    lx.input('erw = _abc + 12*(R4-623902) ')
    try:
        for tok in lx.tokens():
            print tok
    except LexerError, err:
        print 'LexerError at position', err.pos
It works just fine, but I'm a bit worried that it's too inefficient. Are there any regex tricks that will allow me to write it in a more efficient / elegant way ?
Specifically, is there a way to avoid looping over all the regex rules linearly to find one that fits?
I suggest using the re.Scanner class; it's not documented in the standard library, but it's well worth using. Here's an example:
import re

scanner = re.Scanner([
    (r"-?[0-9]+\.[0-9]+([eE]-?[0-9]+)?", lambda scanner, token: float(token)),
    (r"-?[0-9]+", lambda scanner, token: int(token)),
    (r" +", lambda scanner, token: None),
])

>>> scanner.scan("0 -1 4.5 7.8e3")[0]
[0, -1, 4.5, 7800.0]
You can merge all your regexes into one using the "|" operator and let the regex library do the work of discerning between tokens. Some care should be taken to ensure the preference of tokens (for example to avoid matching a keyword as an identifier).
I found this in the Python documentation. It's simple and elegant.
import collections
import re

Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])

def tokenize(s):
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    token_specification = [
        ('NUMBER',  r'\d+(\.\d*)?'), # Integer or decimal number
        ('ASSIGN',  r':='),          # Assignment operator
        ('END',     r';'),           # Statement terminator
        ('ID',      r'[A-Za-z]+'),   # Identifiers
        ('OP',      r'[+*\/\-]'),    # Arithmetic operators
        ('NEWLINE', r'\n'),          # Line endings
        ('SKIP',    r'[ \t]'),       # Skip over spaces and tabs
    ]
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line = 1
    pos = line_start = 0
    mo = get_token(s)
    while mo is not None:
        typ = mo.lastgroup
        if typ == 'NEWLINE':
            line_start = pos
            line += 1
        elif typ != 'SKIP':
            val = mo.group(typ)
            if typ == 'ID' and val in keywords:
                typ = val
            yield Token(typ, val, line, mo.start()-line_start)
        pos = mo.end()
        mo = get_token(s, pos)
    if pos != len(s):
        raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))

statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

for token in tokenize(statements):
    print(token)
The trick here is the line:
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
Here (?P<ID>PATTERN) will mark the matched result with a name specified by ID.
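As a tiny standalone illustration of that mechanism (separate from the tokenizer above): the leftmost alternative that matches wins, and mo.lastgroup reports which named group it was:

import re

pattern = re.compile(r'(?P<NUMBER>\d+)|(?P<ID>[A-Za-z_]\w*)')
mo = pattern.match('foo42')
print(mo.lastgroup, mo.group())   # ID foo42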
re.match is anchored. You can give it a position argument:
pos = 0
end = len(text)
while pos < end:
    match = regexp.match(text, pos)
    # do something with your match
    pos = match.end()
Have a look at Pygments, which ships a shitload of lexers for syntax highlighting purposes with different implementations, most based on regular expressions.
It's possible that combining the token regexes will work, but you'd have to benchmark it. Something like:
x = re.compile('(?P<NUMBER>[0-9]+)|(?P<VAR>[a-z]+)')
a = x.match('9999').groupdict()  # => {'VAR': None, 'NUMBER': '9999'}
if a:
    token = [a for a in a.items() if a[1] != None][0]
The filter is where you'll have to do some benchmarking...
Update: I tested this, and it seems as though if you combine all the tokens as stated and write a function like:
def find_token(lst):
    for tok in lst:
        if tok[1] != None: return tok
    raise Exception
You'll get roughly the same speed (maybe a teensy faster) for this. I believe the speedup must be in the number of calls to match, but the loop for token discrimination is still there, which of course kills it.
This isn't exactly a direct answer to your question, but you might want to look at ANTLR. According to this document the python code generation target should be up to date.
As to your regexes, there are really two ways to go about speeding it up if you're sticking to regexes. The first would be to order your regexes in the order of the probability of finding them in a default text. You could figure this out by adding a simple profiler to the code that collects token counts for each token type and running the lexer on a body of work. The other solution would be to bucket-sort your regexes (since your key space, being a character, is relatively small) and then use an array or dictionary to perform the needed regexes after performing a single discrimination on the first character (sketched below).
However, I think that if you're going to go this route, you should really try something like ANTLR which will be easier to maintain, faster, and less likely to have bugs.
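A rough sketch of that bucketing idea (the rule list and the first-character sets here are made up purely for illustration):

import re
import string
from collections import defaultdict

# Group the token regexes by the characters a match can start with, so only a
# handful of patterns are tried at each position instead of the whole rule list.
rules = [
    (r'\d+',          'NUMBER',     string.digits),
    (r'[A-Za-z_]\w*', 'IDENTIFIER', string.ascii_letters + '_'),
    (r'\+',           'PLUS',       '+'),
]

buckets = defaultdict(list)
for regex, tok_type, first_chars in rules:
    compiled = re.compile(regex)
    for ch in first_chars:
        buckets[ch].append((compiled, tok_type))

def next_token(text, pos):
    for compiled, tok_type in buckets.get(text[pos], []):
        m = compiled.match(text, pos)
        if m:
            return tok_type, m.group(), m.end()
    raise ValueError('no rule matches at position %d' % pos)

print(next_token('abc+12', 0))   # ('IDENTIFIER', 'abc', 3)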
