Lexer for Parsing to the end of a line - python

If I have a keyword, how can I get it to, once it encounters a keyword, to just grab the rest of the line and return it as a string? Once it encounters an end of line, return everything on that line.
Here is the line I'm looking at:
description here is the rest of my text to collect
Thus, when the lexer encounters description, I would like "here is the rest of my text to collect" returned as a string
I have the following defined, but it seems to be throwing an error:
states = (
('bcdescription', 'exclusive'),
)
def t_bcdescription(t):
r'description '
t.lexer.code_start = t.lexer.lexpos
t.lexer.level = 1
t.lexer.begin('bcdescription')
def t_bcdescription_close(t):
r'\n'
t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1]
t.type="BCDESCRIPTION"
t.lexer.lineno += t.valiue.count('\n')
t.lexer.begin('INITIAL')
return t
This is part of the error being returned:
File "/Users/me/Coding/wm/wm_parser/ply/lex.py", line 393, in token
raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])
ply.lex.LexError: Illegal character ' ' at index 40
Finally, if I wanted this functionality for more than one token, how could I accomplish that?
Thanks for your time

There is no big problem with your code,in fact,i just copy your code and run it,it works well
import ply.lex as lex
states = (
('bcdescription', 'exclusive'),
)
tokens = ("BCDESCRIPTION",)
def t_bcdescription(t):
r'\bdescription\b'
t.lexer.code_start = t.lexer.lexpos
t.lexer.level = 1
t.lexer.begin('bcdescription')
def t_bcdescription_close(t):
r'\n'
t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1]
t.type="BCDESCRIPTION"
t.lexer.lineno += t.value.count('\n')
t.lexer.begin('INITIAL')
return t
def t_bcdescription_content(t):
r'[^\n]+'
lexer = lex.lex()
data = 'description here is the rest of my text to collect\n'
lexer.input(data)
while True:
tok = lexer.token()
if not tok: break
print tok
and result is :
LexToken(BCDESCRIPTION,' here is the rest of my text to collect\n',1,50)
So maybe your can check other parts of your code
and if I wanted this functionality for more than one token, then you can simply capture words and when there comes a word appears in those tokens, start to capture the rest of content by the code above.

It is not obvious why you need to use a lexer/parser for this without further information.
>>> x = 'description here is the rest of my text to collect'
>>> a, b = x.split(' ', 1)
>>> a
'description'
>>> b
'here is the rest of my text to collect'

Related

Specific words from a text file with PLY

I am making a lexical analyzer for determined words that are in a .txt file, for this I declare determined words reserved and I try to print only the selected words on the screen, but the result I get is that it takes all the words in the txt file and prints them. I've been following the tutorial and the official Ply documentation in http://www.dabeaz.com/ply/ply.html#ply_nn6 but I still don't achieve my goal. Could someone help me with this? Thank you very much.
import ply.lex as lex
import re
import os
import sys
reservadas = {
'if' : 'if',
'then' : 'then',
'else' : 'else',
'while' : 'while',
}
tokens = ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE',
'ODD','ASSIGN','NE','LT','LTE','GT','GTE',
'LPARENT', 'RPARENT','COMMA','SEMMICOLOM',
'DOT','UPDATE'
] + list(reservadas.values())
#tokens = tokens+reservadas
# reservadas = {
# 'begin':'BEGIN',
# 'end':'END',
# 'if':'IF',
# 'then':'THEN',
# 'while':'WHILE',
# 'do':'DO',
# 'call':'CALL',
# 'const':'CONST',
# 'int':'VAR',
# 'procedure':'PROCEDURE',
# 'out':'OUT',
# 'in':'IN',
# 'else':'ELSE'
# }
#tokens = tokens+list(reservadas.values())
t_ignore = '\t '
t_ignore_PLUS = r'\+'
t_ignore_MINUS = r'\-'
t_ignore_TIMES = r'\*'
t_ignore_DIVIDE = r'/'
t_ignore_ODD = r'ODD'
t_ignore_ASSIGN = r'='
t_ignore_NE = r'<>'
t_ignore_LT = r'<'
t_ignore_LTE = r'<='
t_ignore_GT = r'>'
t_ignore_GTE = r'>='
t_ignore_LPARENT = r'\('
t_ignore_RPARENT = r'\)'
t_ignore_COMMA = r','
t_ignore_SEMMICOLOM = r';'
t_ignore_DOT = r'\.'
t_ignore_UPDATE = r':='
def t_ID(t):
r'[a-zA-Z_][a-zA-Z_0-9]*'
t.type = reservadas.get(t.value,'ID') # Check for reserved words
return t
def t_newline(t):
r'\n+'
t.lexer.lineno += len(t.value)
#dsfjksdlgjklsdgjsdgslxcvjlk-,.
def t_COMMENT(t):
r'\//.*'
r'\/*.*'
r'\*/.*'
pass
def t_NUMBER(t):
r'\d+'
t.value = int(t.value)
pass
def t_error(t):
print ("----- '%s'" % t.value[0])
t.lexer.skip(1)
while True:
tok = analizador.token()
if not tok : break
print (tok)
the output I get with the above code is:
LexToken(ID,'FSR',1,3)
LexToken(ID,'testing',1,7)
LexToken(ID,'sketch',1,15)
'---- '
'---- '
LexToken(ID,'Connect',3,28)
LexToken(ID,'one',3,36)
LexToken(ID,'end',3,40)
LexToken(ID,'of',3,44)
LexToken(ID,'FSR',3,47)
LexToken(ID,'to',3,51)
LexToken(ID,'V',3,55)
LexToken(ID,'the',3,58)
LexToken(ID,'other',3,62)
LexToken(ID,'end',3,68)
LexToken(ID,'to',3,72)
LexToken(ID,'Analog',3,75)
'---- '
.
.
.
.
LexToken(ID,'Serial',21,694)
LexToken(ID,'print',21,701)
----- '"'
LexToken(ID,'Analog',21,708)
LexToken(ID,'reading',21,715)
----- '"'
'---- '
LexToken(ID,'Serial',22,732)
LexToken(ID,'println',22,739)
LexToken(ID,'fsrReading',22,747)
'---- '
'---- '
LexToken(ID,'LEDbrightness',26,898)
LexToken(ID,'map',26,914)
LexToken(ID,'fsrReading',26,918)
'---- '
LexToken(ID,'analogWrite',28,996)
LexToken(ID,'LEDpin',28,1008)
LexToken(ID,'LEDbrightness',28,1016)
'---- '
LexToken(ID,'IF',29,1034)
'---- '
LexToken(if,'if',30,1038)
'---- '
LexToken(ID,'delay',31,1044)
'---- '
----- '}'
Press any key to continue . . .
my expectation for the exit would be this:
LexToken(ID,'IF',29,1034)
'---- '
LexToken(if,'if',30,1038)
I am analyzing a code of arduino, and all those words are comments, but I only need you to look for the conditionals if or IF, or other reserved words like for, but the main idea is that with a list of reserved words you identify them and show me only those selected
If you want to discard tokens that are not in your 'reserved' list, adjust the t_ID function like so:
def t_ID(t):
r'[a-zA-Z_][a-zA-Z_0-9]*'
reserved_type = reservadas.get(t.value, False)
if reserved_type:
t.type = reserved_type
return t # Return token with reserved type
return None # Discard non-reserved tokens
Additionally, your comment token function is probably misapplied here.
def t_COMMENT(t):
r'\//.*'
r'\/*.*'
r'\*/.*'
pass
You can't use multiple rules or span a rule over multiple strings like this. Because the docstring (which ply uses to get the regex) will only contain the very first string.
Secondly, I think the regex needs adjusting for comments, assuming you're tokenizing C or a C-like language. Particularly, it needs to account for the possibility that comments span multiple lines.
To fix, apply the following for dealing with comments:
def t_block_comment(tok):
r'/\*((.|\n))*?\*/'
tok.lexer.lineno += tok.value.count('\n')
return None # Discard block comments "/* comment */"
t_ignore_comment = r'//.*' # ignore inline comments "// comment"
You may also need to apply the regex multiline flag:
analizador = lex.lex(reflags=re.MULTILINE)
Lastly, your t_ignore_DIVIDE = r'/' may be preventing your comment rules from applying, too. Consider ordering this after the comment rules.

Checking a text segment within brackets with python

I have a text file, which is strucutred as following:
segmentA {
content Aa
content Ab
content Ac
....
}
segmentB {
content Ba
content Bb
content Bc
......
}
segmentC {
content Ca
content Cb
content Cc
......
}
I know how to search certrain strings through the whole text file, but how can i define to search for a certain string whithin, like example, "segmentC". I need something like reg expression to tell the script??:
If text beginn with "segmentC {" perform a search of a certain string until the first "}" appears.
Someone an idea?
Thanks in advance!
Not a RegEx solution ...but would do the work!
def SearchStuff(lines,sstr):
i=0
while(lines[i]!='}'):
#Do stuffff .....for e.g.
if 'Ca' in lines[i]:
return lines[i]
i+=1
def main(search_str):
f=open('file.txt','r')
lines = f.readlines()
f.close()
for line in lines:
if search_str in line:
index = lines.index(line)
break
lines = lines[index+1:]
print SearchStuff(lines,search_str)
search_str = 'segmentC' #set this string accordingly
main(search_str)
Depending on the complexity you are looking for, you can range from a simple state machine with line based pattern searching to a full lexer.
Line based search
The below example makes the assumption that you are only looking for one segment and that segmentC { and the closing } are on one single line.
def parsesegment(fh):
# Yields all lines inside "segmentC"
state = "out"
for line in fh:
line = line.strip() # in case there are whitespaces around
if state == "out":
if line.startswith("segmentC {"):
state = "in"
break
elif state == "in":
if line.startswith("}"):
state = "out"
break
# Work on the specific lines here
yield line
with open(...) as fh:
for line in parsesegment(fh):
# do something
Simple Lexer
If you need more flexibility, you can design a simple lexer/parser couple. For example, the following code makes no assumption to the organisation of the syntax between lines. It also ignores unknown pattern, which a typical lexer do not (normally it should raise a syntax error):
import re
class ParseSegment:
# Dictionary of patterns per state
# Tuples are (token name, pattern, state change command)
_regexes = {
"out": [
("open", re.compile(r"segment(?P<segment>\w+)\s+\{"), "in")
],
"in": [
("close", re.compile(r"\}"), "out"),
# Here an example of what you could want to match
("content", re.compile(r"content\s+(?P<content>\w+)"), None)
]
}
def lex(self, source, initpos = 0):
pos = initpos
end = len(source)
state = "out"
while pos < end:
for token_name, reg, state_chng in self._regexes[state]:
# Try to get a match
match = reg.match(source, pos)
if match:
# Advance according to how much was matched
pos = match.end()
# yield a token if it has a name
if token_name is not None:
# Yield token name, the full matched part of source
# and the match grouped according to (?P<tag>) tags
yield (token_name, match.group(), match.groupdict())
# Switch state if requested
if state_chng is not None:
state = state_chng
break
else:
# No match, advance by one character
# This is particular to that lexer, usually no match means
# the input file has an error in the syntax and lexer should
# yield an exception
pos += 1
def parse(self, source, initpos = 0):
# This is an example of use of the lexer with a parser
# This converts the input file into a dictionary. Keys are segment
# names, and values are list of contents.
segments = {}
cur_segment = None
# Use lexer to get tokens from source
for token, fullmatch, groups in self.lex(source, initpos):
# On open, create the list of content in segments
if token == "open":
cur_segment = groups["segment"]
segments[cur_segment] = []
# On content, ensure we know the segment and add content to the
# list
elif token == "content":
if cur_segment is None:
raise RuntimeError("Content found outside a segment")
segments[cur_segment].append(groups["content"])
# On close, set the current segment to unknown
elif token == "close":
cur_segment = None
# ignore unknown tokens, we could raise an error instead
return segments
def main():
with open("...", "r") as fh:
data = fh.read()
lexer = ParseSegment()
segments = lexer.parse(data)
print(segments)
return 0
if __name__ == '__main__':
main()
Full Lexer
Then if you need even more flexibility and reuseability, you will have to create a full parser. No need to reinvent the wheel, have a look at this list of language parsing modules, you will probably find the one that suits you.

parsing a string in python for #hashtag

I am wondering, how could I make an algorithm that parses a string for the hashtag symbol ' # ' and returns the full string, but where ever a word starts with a '#' symbol, it becomes a link. I am using python with Google app engine: webapp2 and Jinja2 and I am building a blog.
Thanks
A more efficient and complete way to find the "hashwords":
import functools
def hash_position(string):
return string.find('#')
def delimiter_position(string, delimiters):
positions = filter(lambda x: x >= 0, map(lambda delimiter: string.find(delimiter), delimiters))
try:
return functools.reduce(min, positions)
except TypeError:
return -1
def get_hashed_words(string, delimiters):
maximum_length = len(string)
current_hash_position = hash_position(string)
string = string[current_hash_position:]
results = []
counter = 0
while current_hash_position != -1:
current_delimiter_position = delimiter_position(string, delimiters)
if current_delimiter_position == -1:
results.append(string)
else:
results.append(string[0:current_delimiter_position])
# Update offsets and the haystack
string = string[current_delimiter_position:]
current_hash_position = hash_position(string)
string = string[current_hash_position:]
return results
if __name__ == "__main__":
string = "Please #clarify: What do you #mean with returning somthing as a #link. #herp"
delimiters = [' ', '.', ',', ':']
print(get_hashed_words(string, delimiters))
Imperative code with updates of the haystack looks a little bit ugly but hey, that's what we get for (ab-)using mutable variables.
And I still have no idea what do you mean with "returning something as a link".
Hope that helps.
not sure where do you get the data for the link, but maybe something like:
[('%s' % word) for word in input.split() if word[0]=='#']
Are you talking about twitter? Maybe this?
def get_hashtag_link(hashtag):
if hashtag.startswith("#"):
return '%s' % (hashtag[1:], hashtag)
>>> get_hashtag_link("#stackoverflow")
'#stackoverflow'
It will return None if hashtag is not a hashtag.

Find strings that begins with a '#' and create link

I want to check whether a string (a tweet) begins with a '#' (i.e. is a hashtag) or not, and if so create a link.
Below is what I've tried so far but it doesn't work (error on the last line).
How can I fix this and will the code work for the purpose?
tag_regex = re.compile(r"""
[\b#\w\w+] # hashtag found!""", re.VERBOSE)
message = raw_message
for tag in tag_regex.findall(raw_message):
message = message.replace(url, '' + message + '')
>>> msg = '#my_tag the rest of my tweet'
>>> re.sub('^#(\w+) (.*)', r'\2', msg)
'the rest of my tweet'
>>>

Problems porting from Python to Ruby

I have a neat little script in python that I would like to port to Ruby and I think it's highlighting my noobishness at Ruby. I'm getting the error that there is an unexpected END statement, but I don't see how this can be so. Perhaps there is a keyword that requires an END or something that doesn't want an END that I forgot about. Here is all of the code leading up to the offending line Offending line is commented.
begin
require base64
require base32
rescue LoadError
puts "etext requires base32. use 'gem install --remote base32' and try again"
end
# Get a string from a text file from disk
filename = ARGV.first
textFile = File.open(filename)
text = textFile.read()
mailType = "text only" # set the default mailType
#cut the email up by sections
textList1 = text.split(/\n\n/)
header = textList1[0]
if header.match (/MIME-Version/)
mailType = "MIME"
end
#If mail has no attachments, parse as text-only. This is the class that does this
class TextOnlyMailParser
def initialize(textList)
a = 1
body = ""
header = textList[0]
#parsedEmail = Email.new(header)
while a < textList.count
body += ('\n' + textList[a] + '\n')
a += 1
end
#parsedEmail.body = body
end
end
def separate(text,boundary = nil)
# returns list of strings and lists containing all of the parts of the email
if !boundary #look in the email for "boundary= X"
text.scan(/(?<=boundary=).*/) do |bound|
textList = recursiveSplit(text,bound)
end
return textList
end
if boundary
textList = recursiveSplit(text,boundary)
end
end
def recursiveSplit(chunk,boundary)
if chunk.is_a? String
searchString = "--" + boundary
ar = cunk.split(searchString)
return ar
elsif chunk.is_a? Array
chunk do |bit|
recursiveSplit(bit,boundary);
end
end
end
class MIMEParser
def initialize(textList)
#textList = textList
#nestedItems = []
newItem = NestItem.new(self)
newItem.value = #textList[0]
newItem.contentType = "Header"
#nestedItems.push(newItem)
#setup parsed email
#parsedEmail = Email.new(newItem.value)
self._constructNest
end
def checkForContentSpecial(item)
match = item.value.match (/Content-Disposition: attachment/)
if match
filename = item.value.match (/(?<=filename=").+(?=")/)
encoding = item.value.match (/(?<=Content-Transfer-Encoding: ).+/)
data = item.value.match (/(?<=\n\n).*(?=(\n--)|(--))/m)
dataGroup = data.split(/\n/)
dataString = ''
i = 0
while i < dataGroup.count
dataString += dataGroup[i]
i ++
end #<-----THIS IS THE OFFENDING LINE
#parsedEmail.attachments.push(Attachment.new(filename,encoding,dataString))
end
Your issue is the i ++ line, Ruby does not have a post or pre increment/decrement operators and the line is failing to parse. I can't personally account as to why i++ evaluates in IRB but i ++ does not perform any action.
Instead replace your ++ operators with += 1 making that last while:
while i < dataGroup.count
dataString += dataGroup[i]
i += 1
end
But also think about the ruby way, if you're just adding that to a string why not do a dataString = dataGroup.join instead of looping over with a while construct?

Categories

Resources