Splitting bracket delimited text which can contain quoted strings - python

I am trying to split some text. Basically I want to separate level-1 brackets, like "('1','a',NULL),(2,'b')" => ["('1','a',NULL)", "(2,'b')"], but I need to be aware of possible quoted strings inside. It needs to at least satisfy the following py.tests:
from splitter import split_text
def test_normal():
assert split_text("('1'),('2')") == ["('1')", "('2')"]
assert split_text("(1),(2),(3)") == ["(1)", "(2)", "(3)"]
def test_complex():
assert split_text("('1','a'),('2','b')") == ["('1','a')", "('2','b')"]
assert split_text("('1','a',NULL),(2,'b')") == ["('1','a',NULL)", "(2,'b')"]
def test_apostrophe():
assert split_text("('\\'1','a'),('2','b')") == ["('\\'1','a')", "('2','b')"]
def test_coma_in_string():
assert split_text("('1','a,c'),('2','b')") == ["('1','a,c')", "('2','b')"]
def test_bracket_in_string():
assert split_text("('1','a)c'),('2','b')") == ["('1','a)c')", "('2','b')"]
def test_bracket_and_coma_in_string():
assert split_text("('1','a),(c'),('2','b')") == ["('1','a),(c')", "('2','b')"]
def test_bracket_and_coma_in_string_apostrophe():
assert split_text("('1','a\\'),(c'),('2','b')") == ["('1','a\\'),(c')", "('2','b')"]
I have tried the following:
1) Regular expressions
This looks like the best solution, but unfortunately I did not come up with anything satisfying all tests.
My best try is:
def split_text(text):
    """Split *text* at every comma that sits directly between ``)`` and ``(``.

    The split is purely lexical: quoted strings are not recognised, so a
    ``),(`` sequence inside a quoted value is split as well.
    """
    import re  # local import keeps the snippet self-contained

    # Raw string: in a normal literal "\)" and "\(" are invalid escape
    # sequences, which recent Python versions warn about.
    return re.split(r'(?<=\)),(?=\()', text)
But obviously, that is rather simplistic and fails test_bracket_and_coma_in_string and test_bracket_and_coma_in_string_apostrophe.
2) Finite-state-machine-like solution
I tried to code the FSM myself:
# Parser states: between groups, inside a bracket group, inside a quoted
# string, and directly after a backslash inside a quoted string.
OUTSIDE, IN_BRACKETS, IN_STRING, AFTER_BACKSLASH = range(4)


def split_text(text):
    """Split *text* into its top-level bracket groups with a small state machine.

    Commas seen while OUTSIDE a bracket group end the current piece; every
    other character is copied into the current piece. Inside a quoted string
    a backslash makes the following character literal, so ``\\'`` does not
    close the string.

    Returns a list of pieces; whatever was collected after the last
    separating comma (possibly an empty string) is always the final element.
    """
    state = OUTSIDE
    read = []
    result = []
    for character in text:
        if state == OUTSIDE:
            if character == ',':
                # Separator between two groups: flush the collected piece.
                result.append(''.join(read))
                read = []
            elif character == '(':
                read.append(character)
                state = IN_BRACKETS
            else:
                read.append(character)
        elif state == IN_BRACKETS:
            read.append(character)
            if character == ')':
                state = OUTSIDE
            elif character == "'":
                state = IN_STRING
        elif state == IN_STRING:
            read.append(character)
            if character == "'":
                state = IN_BRACKETS
            elif character == '\\':
                state = AFTER_BACKSLASH
        elif state == AFTER_BACKSLASH:
            # The escaped character is taken literally, whatever it is.
            read.append(character)
            state = IN_STRING
    result.append(''.join(read))  # The rest of the string
    return result
It works, passes all tests, but is very slow.
3) pyparsing
from pyparsing import QuotedString, ZeroOrMore, Literal, Group, Suppress, Word, nums
null_value = Literal('NULL')
number_value = Word(nums)
string_value = QuotedString("'", escChar='\\', unquoteResults=False)
value = null_value | number_value | string_value
one_bracket = Group(Literal('(') + value + ZeroOrMore(Literal(',') + value) + Literal(')'))
all_brackets = one_bracket + ZeroOrMore(Suppress(',') + one_bracket)
def split_text(text):
parse_result = all_brackets.parseString(text)
return [''.join(a) for a in parse_result]
Also passes all tests, but surprisingly it is even slower than solution #2.
Any ideas how to make the solution fast and robust? I have this feeling that I am missing something obvious.

One way would be to use the newer regex module which supports the (*SKIP)(*FAIL) functionality:
import regex as re
def split_text(text):
rx = r"""'.*?(?<!\\)'(*SKIP)(*FAIL)|(?<=\)),(?=\()"""
return re.split(rx, text)
Broken down it says:
'.*?(?<!\\)' # look for a single quote up to a new single quote
# that MUST NOT be escaped (thus the neg. lookbehind)
(*SKIP)(*FAIL)| # these parts shall fail
(?<=\)),(?=\() # your initial pattern with a positive lookbehind/ahead
This succeeds on all your examples.

I cooked this and it works on given tests.
def split_text(text):
    """Split *text* into its top-level bracket groups.

    Tracks bracket depth and whether the scan is inside a single-quoted
    string. A quote preceded by a backslash does not toggle the string
    state. A piece is emitted each time the depth is zero after an
    unquoted, non-comma character has been consumed.
    """
    pieces = []
    current = []
    depth = 0
    in_quote = False
    for idx, char in enumerate(text):
        if char == "'":
            # Guard idx > 0: text[-1] would wrap around to the LAST character
            # and could wrongly mark the very first quote as escaped.
            escaped = idx > 0 and text[idx - 1] == '\\'
            if not escaped:
                in_quote = not in_quote
            current.append(char)
        elif in_quote:
            current.append(char)
        elif char == ',':
            # Commas inside a group are kept; separators between groups are dropped.
            if depth:
                current.append(char)
        else:
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
            current.append(char)
            if not depth:
                pieces.append(''.join(current))
                current = []
    return pieces


tests = ["('1'),('2')",
         "(1),(2),(3)",
         "('1','a'),('2','b')",
         "('1','a',NULL),(2,'b')",
         "('\\'1','a'),('2','b')",
         "('1','a,c'),('2','b')",
         "('1','a)c'),('2','b')",
         "('1','a),(c'),('2','b')",
         "('1','a\\'),(c'),('2','b')"]
for text in tests:
    # print() with a single argument behaves the same on Python 2 and 3.
    print(split_text(text))
Output:
["('1')", "('2')"]
['(1)', '(2)', '(3)']
["('1','a')", "('2','b')"]
["('1','a',NULL)", "(2,'b')"]
["('\\'1','a')", "('2','b')"]
["('1','a,c')", "('2','b')"]
["('1','a)c')", "('2','b')"]
["('1','a),(c')", "('2','b')"]
["('1','a\\'),(c')", "('2','b')"]
The code has room for improvement, and edits are welcome. :)

This is the regular expression which seems to work and passes all the tests. Running it on real data it is about 6x faster than finite state machine implemented in Python.
# One top-level bracket group. Inside the brackets the pattern consumes either
# a complete quoted string or a single character that can neither start a
# string nor close the group, so brackets and commas inside quotes are skipped.
PATTERN = re.compile(
    r"""
    \(                          # Opening bracket
    (?:
        '(?:\\.|[^'\\])*'       # Quoted string; a backslash escapes the next character,
                                # so an escaped backslash before the closing quote
                                # is not mistaken for an escaped apostrophe
        |
        [^')]                   # or any other character of an unquoted literal / comma
    )*
    \)                          # Closing bracket
    """,
    re.VERBOSE | re.DOTALL)


def split_text(text):
    """Return every top-level bracket group found in *text*, in order.

    Text between the groups (normally the separating commas) is ignored.
    """
    return PATTERN.findall(text)

Related

A more pythonic way to check for duplicate characters and check that they are next to each other?

Is there a more pythonic way to check if there are two '((' AND '))' in the string? I've made an attempt but it does not look very pythonic
def check(string):
    """Report whether *string* contains both "((" and "))" somewhere.

    Returns the string "true" or "false" (not a bool). The order of the two
    pairs does not matter. Inputs shorter than two characters simply yield
    "false" instead of raising IndexError.
    """
    has_double_open = False
    has_double_close = False
    # Walk over every pair of neighbouring characters.
    for left, right in zip(string, string[1:]):
        if left == "(" and right == "(":
            has_double_open = True
        elif left == ")" and right == ")":
            has_double_close = True
    if has_double_open and has_double_close:
        return "true"
    return "false"
examples:
check("((a)) + b") # true
check("((a) + b") # false
check("(a+b))") # false
check("((a+b))") # true
check("((a)(b))") # true
check("(((a)))") # true
check("a)) + ((b") # true
If all you want is "has adjacent parens of each type", the solution is just:
def check(string):
    """Return True when *string* contains both "((" and "))", in any order."""
    return '((' in string and '))' in string
A regex could be used to confirm they appear in the correct order and reduce the work to a single pass over string, not two, but if you don't need correct "logical" order, the incremental work of scanning twice is pretty meaningless 99% of the time. A regex solution would just be:
import re
def check(string):
    """Return True when "((" occurs in *string* and "))" occurs somewhere after it.

    re.DOTALL lets the gap between the two pairs span newlines.
    """
    return re.search(r'\(\(.*\)\)', string, re.DOTALL) is not None

Balanced String Recursion Returns Improperly

I'm currently working on a problem to write a recursive program to remove all the balanced bracket operators from a string or return False if the string is not balanced. I can get the program to remove all the brackets but, according to the debugger, when the program does its final base case check to verify that the string is empty, the program jumps from return True in line 3 to isBalanced recursive call in line 10. I don't understand why this is happening. Code is the following:
def isBalanced(string):
    """Return True if *string* consists solely of properly nested (), [] and {}.

    Works by repeatedly removing the innermost pair: the first closing
    bracket in the string must be immediately preceded by its matching
    opener; that pair is cut out and the remainder is checked recursively.

    Prints a diagnostic and returns False as soon as a closing bracket has no
    matching opener directly before it, or when the string contains no
    closing bracket at all although it is not empty.
    """
    if not string:  # Base case: everything was removed
        return True
    for j, char in enumerate(string):
        if char in ')]}':
            # j > 0 guard: string[-1] would wrap around to the last character.
            if j > 0 and string[j - 1] + char in ('()', '[]', '{}'):
                # The result of the recursive call must be propagated;
                # without `return` the caller keeps looping and yields None.
                return isBalanced(string[:j - 1] + string[j + 1:])
            print('Program failed at:', string)
            return False
    # Non-empty string without any closing bracket: leftovers remain.
    print('Program failed at:', string)
    return False
test_str = "({[]()})"
print(isBalanced(test_str))

How can I capitalize the first letter of a string in Python, ignoring HTML tags?

I would like to capitalize the first letter of a string, ignoring HTML tags. For instance:
hello world
should become:
Hello world
I wrote the following, which works, but it seems inefficient, since every character of the string is being copied to the output. Is there a better way to do it?
#register.filter
def capinit(value):
    """Upper-case the first letter or digit of *value* that is outside an HTML tag.

    Everything between ``<`` and ``>`` is skipped. Only the first
    alphanumeric character outside a tag is considered; if it is a digit the
    text is returned unchanged. Leading punctuation and whitespace are
    ignored.
    """
    in_tag = False
    for index, char in enumerate(value):
        if char == '<':
            in_tag = True
        elif char == '>':
            in_tag = False
        elif not in_tag and (char.isalpha() or char.isdigit()):
            # Stop at the first hit and splice once instead of rebuilding the
            # whole string one character at a time.
            return value[:index] + char.upper() + value[index + 1:]
    return value
Note that this ignores initial punctuation. It will capitalize the first letter it finds, unless it finds a number first in which case it doesn't capitalize anything.
I tried to do what you wanted:
html = 'hello world'
afterletter = None
dontcapital = 0
afterhtml = ""
for character in html:
if character == "/" and afterletter == "<":
afterhtml += character
dontcapital = 1
elif afterletter == ">":
if dontcapital == 0:
afterhtml += character.upper()
else:
afterhtml += character
dontcapital = 0
else:
afterhtml += character
afterletter = character
print(afterhtml)
#afterhtml is the output!
This should work, judging from all the tests I did.
If anyone wants to improve it, feel free.

Bracket balancing algorithm doesn't detect imbalanced brackets

The code takes in any combination of brackets and checks if they are balanced or not. If they are balanced it should output success; if they aren't balanced it should output the index (starting at index 1) where the brackets are not balanced.
Example:
Input: ())
Output: 3
\\
Input: ()
Output: Success
The code always displays "Success" regardless of it being balanced or not.
Instead I get this:
Input: ())
Output: Success
import sys
def Match(self, c):
    """Return True when opening bracket *self* is closed by bracket *c*."""
    return (self, c) in (('[', ']'), ('{', '}'), ('(', ')'))
if __name__ == "__main__":
text = sys.stdin.read()
char_code = 0
opening_brackets_stack = []
for i, next in enumerate(text):
if next == '(' or next == '[' or next == '{':
char_code += 1
opening_brackets_stack.append(next)
stack_pop = opening_brackets_stack.pop()
if next == ')' or next == ']' or next == '}':
char_code += 1
if not Match(stack_pop, next):
print(char_code)
else:
char_code += 1
print ('Success')
Your code is printing "Success" because you've told it that after it finishes it should always print success
if __name__ == "__main__":
# A bunch of stuff unrelated to program flow...
print ('Success')
You probably only want to print success if you've reached the end of your text with nothing left on the stack.
if __name__ == "__main__":
text = sys.stdin.read()
char_code = 0
opening_brackets_stack = []
for i, next in enumerate(text):
if next == '(' or next == '[' or next == '{':
char_code += 1
opening_brackets_stack.append(next)
stack_pop = opening_brackets_stack.pop()
if next == ')' or next == ']' or next == '}':
char_code += 1
if not Match(stack_pop, next):
print(char_code)
else:
char_code += 1
if not opening_brackets_stack: # <-- new line
print ('Success')
Except this won't solve your problem either, since you've never properly checked if you have an unmatched closing bracket, only an unmatched opening bracket. Consider this, instead:
# Map every opening bracket to the closing bracket it expects; this lets us
# check for an expected closing bracket more easily.
opening_brackets = "([{"
closing_brackets = ")]}"
mapping = dict(zip(opening_brackets, closing_brackets))
stack = []
for ch in text:
    if ch in opening_brackets:
        # throw the closing bracket we now expect on the stack
        matching_closer = mapping[ch]
        stack.append(matching_closer)
    elif stack and ch == stack[-1]:
        # the character closes the last-opened bracket; `stack and` guards
        # against IndexError when nothing is open
        stack.pop()  # pop it off
    elif ch in closing_brackets:
        # this is an unmatched closing bracket, making the brackets
        # imbalanced in this expression
        print("FAILED")
        sys.exit(1)  # closes the program immediately with a retcode of 1
    else:
        # not a bracket, continue as normal
        # this is technically a NOP and everything from the `else` can be
        # omitted, but I think this looks more obvious to the reader.
        continue
if not stack:  # empty stack means matched brackets!
    print("SUCCESS")
else:
    print("FAILED")
Code can contain any brackets from the set []{}(), where the opening brackets are [,{, and ( and the closing brackets corresponding to them are ],}, and ).
For convenience, the text editor should not only inform the user that there is an error in the usage of brackets, but also point to the exact place in the code with the problematic bracket. First priority is to find the first unmatched closing bracket which either doesn’t have an opening bracket before it, like ] in ](), or closes the wrong opening bracket, like } in ()[}. If there are no such mistakes, then it should find the first unmatched opening bracket without the corresponding closing bracket after it, like ( in {}([]. If there are no mistakes, text editor should inform the user that the usage of brackets is correct.
Apart from the brackets, code can contain big and small latin letters, digits and punctuation marks.
More formally, all brackets in the code should be divided into pairs of matching brackets, such that in each pair the opening bracket goes before the closing bracket, and for any two pairs of brackets either one of them is nested inside another one as in (foo[bar]) or they are separate as in f(a,b)-g[c]. The bracket [ corresponds to the bracket ], { corresponds to }, and ( corresponds to ).
# python3
from collections import namedtuple
# An opening bracket together with its 1-based position in the text.
Bracket = namedtuple("Bracket", ["char", "position"])


def are_matching(left, right):
    """Return True when *left* followed by *right* forms one of (), [] or {}."""
    return (left + right) in ["()", "[]", "{}"]


def find_mismatch(text):
    """Locate the first bracket error in *text*.

    Returns the 1-based position of the first closing bracket that has no
    opener or closes the wrong opener. If there is none, returns the position
    of the first opening bracket that is never closed. Returns the string
    "Success" when all brackets are matched.
    """
    opening_brackets_stack = []
    for position, char in enumerate(text, start=1):
        if char in "([{":
            opening_brackets_stack.append(Bracket(char, position))
        elif char in ")]}":
            if not opening_brackets_stack:
                return position
            top = opening_brackets_stack.pop()
            if not are_matching(top.char, char):
                return position
    if opening_brackets_stack:
        # The bottom of the stack is the earliest opener still unclosed.
        return opening_brackets_stack[0].position
    return "Success"


def main():
    """Read one line from standard input and print the result of the check."""
    text = input()
    mismatch = find_mismatch(text)
    print(mismatch)


if __name__ == "__main__":
    main()

docstring blocks elif statement

Let me paste the exact code I have:
This is the short module
class SentenceSplitter:
def __init__(self, filename=None):
self._raw_text = self.raw_text(filename)
self._sentences = self.to_sentences()
def raw_text(self, filename):
text = ''
with open(filename, 'r') as file:
for line in file.readlines():
line = line.strip()
text += ''.join(line.replace(line, line+' '))
file.close()
text = text.strip() # Deal with the last whitespace
return text
def to_sentences(self):
""" Sentence boundaries occur at '.', '!', '?' except that,
there are some not-sentence boundaries that
may occur before/after the period.
"""
raw_text = self._raw_text
sentences = []
sentence = ''
boundary = None
for char in raw_text:
sentence += ''.join(char)
if char == '!' or char == '?':
sentences.append(sentence)
sentence = ''
""" The sign -> refers to 'followed by' """
elif char == '.':
i = raw_text.index(char) # slicing previous/following characters
boundary = True
if boundary:
sentences.append(sentence)
sentence = ''
return sentences
And the main:
import textchange
ss = textchange.SentenceSplitter(filename='text.txt')
print(ss._sentences)
The docstring after the first if statement
""" The sign -> refers to 'followed by' """
If I comment it out, the program runs; otherwise it does not.
There was more code in the elif statement, but I removed it after making sure the error is still thrown. Here is the traceback:
Traceback (most recent call last):
File "D:\Programs\Python 3.3.2\Tutorials\46 Simple Python Exercises.py", line 26, in
<module>
import textchange
File "D:\Programs\Python 3.3.2\Tutorials\textchange.py", line 51
elif char == '.':
^
SyntaxError: invalid syntax
Docstrings are just string literals that are found at the start of the function. They still have to follow indentation rules.
Your string is not correctly indented for the elif block; by being de-dented from the if block before, you ended the if-elif-else blocks altogether and no elif is permitted to follow.
Use a regular, normal comment instead, a line starting with #; lines that contain only comments are exempt from the indentation rules:
if char == '!' or char == '?':
sentences.append(sentence)
sentence = ''
# The sign -> refers to 'followed by'
elif char == '.':
i = raw_text.index(char) # slicing previous/following characters
boundary = True
or indent the string (which is entirely still executed by Python as code, but otherwise not assigned and thus discarded again):
if char == '!' or char == '?':
sentences.append(sentence)
sentence = ''
elif char == '.':
""" The sign -> refers to 'followed by' """
i = raw_text.index(char) # slicing previous/following characters
boundary = True

Categories

Resources