Conditional regex replacement - python

With Python, can you check whether a group is empty before replacing text?
Example:
[user] John Marshal --> [user]<br><strong>John Marshal</strong>
John Marshal --> <strong>John Marshal</strong>
The regex should look like the one below, but with the "condition" to insert the <br> only if group 1 is found.
title = re.sub(r'^\s*(\[.*?\])?\s*(.*)', r'\1<br><strong>\2</strong>', title)

Group one is always found because you allow empty matches.
You want to match at least one character, not 0 or more, so use .+?:
title = re.sub(r'^\s*(\[.+?\])?\s*(.*)', r'\1<br><strong>\2</strong>', title)
On Python versions before 3.5, re.sub raised an error ("unmatched group") when group one did not participate in the match; since Python 3.5 an unmatched group is substituted with an empty string instead, so the try/except below only works on old versions — prefer the callback approach further down. Make use of that:
try:
title = re.sub(r'^\s*(\[.+?\])?\s*(.*)', r'\1<br><strong>\2</strong>', title)
except re.error:
title = re.sub(r'^\s*(.*)', r'<strong>\1</strong>', title)
The alternative is to use a function to do the replacement:
def title_sub(match):
    """Replacement callback: wrap the title in <strong>, prefixing the
    bracketed tag plus a <br> only when group 1 actually matched."""
    tag, body = match.group(1), match.group(2)
    if tag:
        return '{}<br><strong>{}</strong>'.format(tag, body)
    return '<strong>{}</strong>'.format(body)
title = re.sub(r'^\s*(\[.+?\])?\s*(.*)', title_sub, title)
Demo:
>>> re.sub(r'^\s*(\[.+?\])?\s*(.*)', title_sub, '[user] John Marshal')
'[user]<br><strong>John Marshal</strong>'
>>> re.sub(r'^\s*(\[.+?\])?\s*(.*)', title_sub, 'John Marshal')
'<strong>John Marshal</strong>'

To do conditional substitutions with regex in Python I've come up with the following solution:
#classmethod
def normalize_query_string(cls, query_string):
    """Rewrite '$field:' tokens in a query string: '$certHash:' is aliased
    to 'ci.C.H:'; any other '$field:' token is reproduced unchanged."""
    def rewrite(m):
        field = m.group("field")
        return "ci.C.H:" if field == "$certHash" else "{}:".format(field)
    return re.sub(r"(?P<field>\$[\w.]+):", rewrite, query_string)

Related

I want to remove the unwanted sub-level duplicate tags using lxml etree

This is the input sample text. I want to do in object based cleanup to avoid hierarchy issues
<p><b><b><i><b><i><b>
<i>sample text</i>
</b></i></b></i></b></b></p>
Required Output
<p><b><i>sample text</i></b></p>
I wrote this object-based cleanup using lxml for sub-level duplicate tags. It may help others.
# Flatten runs of duplicate nested <i>/<b> wrappers using lxml's parent links.
import lxml.etree as ET
textcont = '<p><b><b><i><b><i><b><i>sample text</i></b></i></b></i></b></b></p>'
soup = ET.fromstring(textcont)
for tname in ['i','b']:
    # NOTE(review): the tree is mutated while soup.iter() is live; lxml
    # tolerates this pattern here, but the order of removals matters.
    for tagn in soup.iter(tname):
        # Grandparent carries the same tag: hoist the parent up one level and
        # drop the now-redundant grandparent (only when it holds no text).
        if tagn.getparent().getparent() != None and tagn.getparent().getparent().tag == tname:
            iparOfParent = tagn.getparent().getparent()
            iParent = tagn.getparent()
            if iparOfParent.text == None:
                iparOfParent.addnext(iParent)
                iparOfParent.getparent().remove(iparOfParent)
        # Direct parent carries the same tag: hoist this element, drop the parent.
        elif tagn.getparent() != None and tagn.getparent().tag == tname:
            iParent = tagn.getparent()
            if iParent.text == None:
                iParent.addnext(tagn)
                iParent.getparent().remove(iParent)
print(ET.tostring(soup))
output:
b'<p><b><i>sample text</i></b></p>'
The markup itself provides the structure needed to extract the elements inside.
Using re in python, you may extract elements and recombine them.
For example:
import re

html = """<p><b><b><i><b><i><b>
<i>sample text</i>
</b></i></b></i></b></b></p>"""
# Raw strings: '\<' in a plain string literal is an invalid escape sequence
# (a SyntaxWarning on modern Python) and the backslashes are unnecessary —
# '<' and '>' have no special meaning in a regex.
regex_object = re.compile(r"<(.*?)>")
html_objects = regex_object.findall(html)
set_html = []
for obj in html_objects:
    # Keep each opening tag once, in first-seen (outer-to-inner) order.
    if obj[0] != "/" and obj not in set_html:
        set_html.append(obj)
regex_text = re.compile(r">(.*?)<")
# First non-empty run of text between tags ('.' does not cross newlines).
text = [result for result in regex_text.findall(html) if result][0]
# Recombine: open every unique tag, insert the text, close in reverse order.
result = ""
for obj in set_html:
    result += f"<{obj}>"
result += text
for obj in set_html[::-1]:
    result += f"</{obj}>"
# result = '<p><b><i>sample text</i></b></p>'
You can use the regex library re to create a function to search for the matching opening tag and closing tag pair and everything else in between. Storing tags in a dictionary will remove duplicate tags and maintain the order they were found in (if order isn't important then just use a set). Once all pairs of tags are found, wrap what's left with the keys of the dictionary in reverse order.
import re
def remove_duplicates(string):
    """Collapse nested duplicate tags, keeping each tag once, outermost first."""
    seen = {}  # dict preserves insertion order and de-duplicates tag names
    pair_re = re.compile(r'<(.+)>([\w\W]*)</\1>')  # [\w\W] matches anything, incl. newlines
    # Peel off the outermost <tag>...</tag> pair until no wrapper remains.
    while True:
        found = pair_re.findall(string)
        if not found:
            break
        tag, string = found[0]  # found is [(tag, inner)]
        seen[tag] = None
    # Re-wrap the remaining text with each unique tag, innermost-found last.
    for tag in reversed(seen):
        string = '<%s>%s</%s>' % (tag, string, tag)
    return string
Note: I've used [\w\W]* as a cheat to match everything.

Why is my code getting the wrong part of the txt string?

The issue I have is that I'm not able to get the correct part of my txt string. The txt string also has to be in the current order, I can't switch role and name.
txt = \
'''
Company name
leader: Claire
cashier: Ole
'''
def role(name):
    """Return the role listed before *name* on its line, e.g. role("Ole") -> 'cashier'.

    The original computed start = txt.find(name) - len(name), which only works
    when the role happens to be exactly as long as the name (hence 'r: Ole').
    Instead, locate the start of the line containing the name and slice up to
    the ':' separator.
    """
    start = txt.find(name)
    line_start = txt.rfind('\n', 0, start) + 1
    return txt[line_start:txt.find(':', line_start)]
If I type role("Ole") I expect the output to be 'cashier'.
The output I am getting though is 'r: Ole'.
You can create a dictionary that associates the names to the correspondent roles, so that it becomes very easy to retrieve them even in a much longer list:
txt = \
'''
Company name
leader: Claire
cashier: Ole
'''
# Tokenize once; the tokens come out as
# ['Company', 'name', 'leader:', 'Claire', 'cashier:', 'Ole'],
# so every name (odd index >= 3) is immediately preceded by its role.
t = txt.split()
roles = {}
for i in range(3, len(t), 2):
    roles[t[i]] = t[i - 1].strip(':')
def role(name):
    """Look up the role for *name*; raises KeyError for unknown names."""
    return roles[name]
Once set up, it's pretty intuitive.
stringy = "cashier: Ole"
stringy.partition(':')[0]
you can use .partition and bring only the first element
this will work but if there is anything diff about your string it may break down.
stringy = """ Company name
leader: Claire
cashier: Ole"""
def role(name):
    """Return the role preceding *name* in stringy, e.g. role("Ole") -> 'cashier'.

    The original took slice.split(' ')[-2], which grabs across newlines
    (e.g. 'Claire\ncashier:' for "Ole") because the tokens around a newline
    contain no space. Split on all whitespace and strip the trailing colon.
    """
    before = stringy.partition(name)[0]
    return before.split()[-1].rstrip(':')

Django split String from database

I try to split a String from a database (SQLite). The String has a linebreak \n and I want split it there in 2 parts. It works with a normal String for example text = "Hello \n World". But if I use the string from my database it doesn't work (the text is saved correctly with \n in the database!!)
My Code for getting the first part of the string:
from django import template
from products.models import News
register = template.Library()
#register.simple_tag
def get_first_title(id):
    """Return the part of the News title (pk=id) before the first newline.

    BUG FIX: the original called str() on a values() QuerySet, so find()
    and slicing operated on the repr "<QuerySet [{'title': ...}]>" rather
    than on the title itself. Fetch the row and split the real title.
    """
    news = News.objects.get(pk=id)
    return news.title.split("\n")[0]
Try this:
#register.simple_tag
def get_first_title(id):
    """Return the portion of the News title (pk=id) before the first newline."""
    item = News.objects.get(pk=id)
    first_line, _sep, _rest = item.title.partition("\n")
    return first_line
Although you should probably catch possible exceptions with:
#register.simple_tag
def get_first_title(id):
    """Return the first line of the News title, or None when the row is missing."""
    try:
        news = News.objects.get(pk=id)
        splitted_title = news.title.split("\n")
        if splitted_title:
            return splitted_title[0]
    except News.DoesNotExist:
        # BUG FIX: a bare comment is not a statement, so the except block was
        # a SyntaxError as written; return None explicitly (or re-raise here).
        return None
Use the following code instead of your code in the last 4 lines
title = news.filter(pk=id)
number = str(title).split(" ")
# number is a list of strings
print(number , len(number))

HTML structure diff in Python

I want to diff html files by structure and not by content. For example: b and a are identical with this diff because the structures of them are equal.
Anyone knows tool (I prefer in python) or implementation do it ?
You need to parse the HTML/XML into a DOM tree and then compare those trees. The preferred solution for parsing in Python is the lxml library. For the comparison itself I am not sure any library exists, but below is a guiding example.
Here is one XML comparison function from Ian Bicking (original source, under Python Software Foundation License, https://bitbucket.org/ianb/formencode/src/tip/formencode/doctest_xml_compare.py?fileviewer=file-view-default#doctest_xml_compare.py-70 )
# Import shims kept from the original (Python 2-era) recipe:
try:
    import doctest
    doctest.OutputChecker
except AttributeError:
    # Python < 2.4 lacked doctest.OutputChecker; fall back to a bundled copy.
    import util.doctest24 as doctest
try:
    import xml.etree.ElementTree as ET
except ImportError:
    # Pre-2.5: ElementTree lived in a standalone package.
    import elementtree.ElementTree as ET
from xml.parsers.expat import ExpatError as XMLParseError
# Keep a handle on the stock checker so HTMLOutputChecker can delegate to it.
RealOutputChecker = doctest.OutputChecker
def debug(*msg):
    """Print the space-joined str() of *msg* to stderr (diagnostic helper)."""
    import sys
    # Python 3: 'print >> sys.stderr, ...' is a syntax error; use file=.
    print(' '.join(map(str, msg)), file=sys.stderr)
class HTMLOutputChecker(RealOutputChecker):
    """Doctest output checker that falls back to structural XML comparison
    when the plain textual comparison fails."""

    def check_output(self, want, got, optionflags):
        # Try the normal textual comparison first.
        normal = RealOutputChecker.check_output(self, want, got, optionflags)
        if normal or not got:
            return normal
        try:
            want_xml = make_xml(want)
        except XMLParseError:
            pass
        else:
            try:
                got_xml = make_xml(got)
            except XMLParseError:
                pass
            else:
                if xml_compare(want_xml, got_xml):
                    return True
        return False

    def output_difference(self, example, got, optionflags):
        actual = RealOutputChecker.output_difference(
            self, example, got, optionflags)
        want_xml = got_xml = None
        # Python 3 syntax: 'except XMLParseError, e' is Python-2-only.
        try:
            want_xml = make_xml(example.want)
            want_norm = make_string(want_xml)
        except XMLParseError as e:
            if example.want.startswith('<'):
                want_norm = '(bad XML: %s)' % e
            else:
                return actual
        try:
            got_xml = make_xml(got)
            got_norm = make_string(got_xml)
        except XMLParseError as e:
            if example.want.startswith('<'):
                got_norm = '(bad XML: %s)' % e
            else:
                return actual
        s = '%s\nXML Wanted: %s\nXML Got : %s\n' % (
            actual, want_norm, got_norm)
        # Compare identity to None: Element truthiness is deprecated and an
        # element with no children tests False, which would wrongly skip the
        # difference report for childless documents.
        if got_xml is not None and want_xml is not None:
            result = []
            xml_compare(want_xml, got_xml, result.append)
            s += 'Difference report:\n%s\n' % '\n'.join(result)
        return s
def xml_compare(x1, x2, reporter=None):
    """Recursively compare two ElementTree elements.

    Returns True when tag, attributes, text, tail and children all match
    (text_compare treats '*' as a wildcard and ignores surrounding
    whitespace). When *reporter* is given, each mismatch is described by
    calling reporter(message).
    """
    if x1.tag != x2.tag:
        if reporter:
            reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
        return False
    for name, value in x1.attrib.items():
        if x2.attrib.get(name) != value:
            if reporter:
                reporter('Attributes do not match: %s=%r, %s=%r'
                         % (name, value, name, x2.attrib.get(name)))
            return False
    for name in x2.attrib.keys():
        if name not in x1.attrib:
            if reporter:
                reporter('x2 has an attribute x1 is missing: %s'
                         % name)
            return False
    if not text_compare(x1.text, x2.text):
        if reporter:
            reporter('text: %r != %r' % (x1.text, x2.text))
        return False
    if not text_compare(x1.tail, x2.tail):
        if reporter:
            reporter('tail: %r != %r' % (x1.tail, x2.tail))
        return False
    # Element.getchildren() was removed in Python 3.9; list(elem) is the
    # documented equivalent.
    cl1 = list(x1)
    cl2 = list(x2)
    if len(cl1) != len(cl2):
        if reporter:
            reporter('children length differs, %i != %i'
                     % (len(cl1), len(cl2)))
        return False
    # 1-based child index in mismatch reports, as in the original.
    for i, (c1, c2) in enumerate(zip(cl1, cl2), start=1):
        if not xml_compare(c1, c2, reporter=reporter):
            if reporter:
                reporter('children %i do not match: %s'
                         % (i, c1.tag))
            return False
    return True

def text_compare(t1, t2):
    """Whitespace-insensitive text equality; '*' on either side matches anything."""
    if not t1 and not t2:
        return True
    if t1 == '*' or t2 == '*':
        return True
    return (t1 or '').strip() == (t2 or '').strip()
def make_xml(s):
    """Parse string *s* as XML wrapped in a synthetic <xml> root element."""
    return ET.XML('<xml>%s</xml>' % s)

def make_string(xml):
    """Serialize an element (or string) back to text, minus the <xml> wrapper."""
    # Python 3: 'unicode' no longer exists; str covers both cases.
    if isinstance(xml, str):
        xml = make_xml(xml)
    # encoding='unicode' yields str; the default yields bytes, which would
    # break the startswith/endswith checks below on Python 3.
    s = ET.tostring(xml, encoding='unicode')
    if s == '<xml />':
        return ''
    assert s.startswith('<xml>') and s.endswith('</xml>'), repr(s)
    return s[5:-6]
def install():
    # Swap in the structural checker so doctests compare HTML/XML by structure
    # rather than by exact text.
    doctest.OutputChecker = HTMLOutputChecker
Sidenote: <\head> is not a valid HTML tag and will be interpreted as text. HTML close tags look like this: </head>
As other answerers may tell you, using a library that actually knows what a DOM is is probably the most reliable option if you're comparing well-structured, complete HTML documents or fragments. A simpler solution than using a DOM is to use regex to match HTML tags.
It's simple (can be done in two lines).
It's reliable in everything I've tested so far, but can give unexpected results when, for example, HTML tags appear in <pre> or <textarea> elements.
Will work with partial HTML fragments like </head>, while DOM/parsing libraries might complain that a <head> tag is missing.
Demo
Following is some code that normalizes HTML input (the HTML of this page, actually) by finding all the tags and printing them in succession.
import re, urllib.request

# Python 3: urllib.urlopen moved to urllib.request.urlopen, and print is a
# function. read() returns bytes, so decode before applying the str regex.
f = urllib.request.urlopen('http://stackoverflow.com/questions/33204018/html-structure-diff-in-python')
html = f.read().decode('utf-8', errors='replace')
for m in re.finditer(r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html):
    print(m.group(0))
You can take the output from the above and use whatever command-line diff tool you prefer to compare them.
Or maybe you want to compare them using Python. Instead of printing out all the lines, you might be interested in concatenating them into a single string:
tags_as_string = ''
# BUG FIX: the loop appended to an undefined name 's'; accumulate into
# tags_as_string, the variable actually introduced above.
for m in re.finditer(r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html):
    tags_as_string += m.group(0) + '\n'  # the newline makes diff output look nicer
or list:
tags_as_list = []
# BUG FIX: the loop called s.append on an undefined name 's'; append to
# tags_as_list, the variable actually introduced above.
for m in re.finditer(r'''</?(\w+)((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html):
    tags_as_list.append(m.group(0))
Further steps to consider (can be done inside the for loop):
Perhaps you're only interested in the tag name and not the attributes. The tag name can be accessed with m.group(1) (the first regex group in parentheses) in the for-loop.
Tags that mean the same thing still might be different due to whitespace. You might want to normalize out the whitespace within each tag using a similar technique.
Credit: The actual regex is from http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/

Django, custom template filters - regex problems

I'm trying to implement a WikiLink template filter in Django that queries the database model to give different responses depending on Page existence, identical to Wikipedia's red links. The filter does not raise an Error but instead doesn't do anything to the input.
WikiLink is defined as: [[ThisIsAWikiLink | This is the alt text]]
Here's a working example that does not query the database:
from django import template
from django.template.defaultfilters import stringfilter
from sites.wiki.models import Page
import re
register = template.Library()
#register.filter
#stringfilter
def wikilink(value):
    """Replace every [[Alias | Text]] wiki-link in *value* with just its text."""
    pattern = r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]'
    return re.sub(pattern, r'\2', value)
wikilink.is_safe = True
The input (value) is a multi-line string, containing HTML and many WikiLinks.
The expected output is substituting [[ThisIsAWikiLink | This is the alt text]] with
This is the alt text
or if "ThisIsAWikiLink" doesn't exist in the database:
This is the alt text
and returning value.
Here's the non-working code (edited in response to comments/answers):
from django import template
from django.template.defaultfilters import stringfilter
from sites.wiki.models import Page
import re
register = template.Library()
#register.filter
#stringfilter
def wikilink(value):
    # NOTE(review): re.match only matches at the START of the string, so a
    # wiki-link embedded in surrounding text is never found — use re.search
    # (or re.sub with a callback) instead.
    m = re.match(r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]', value)
    if(m):
        # BUG: the pattern has only two groups, so group(1) is the alias and
        # group(2) the alt text; group(3) raises IndexError — these indices
        # are off by one.
        page_alias = m.group(2)
        page_title = m.group(3)
        try:
            # Both branches below run the same greedy substitution, so the
            # existence check has no visible effect as written.
            page = Page.objects.get(alias=page_alias)
            return re.sub(r'(\[\[)(.*)\|(.*)(\]\])', r'\3', value)
        except Page.DoesNotExist:
            return re.sub(r'(\[\[)(.*)\|(.*)(\]\])', r'\3', value)
    else:
        return value
wikilink.is_safe = True
What the code needs to do is:
extract all the WikiLinks in value
query the Page model to see if the page exists
substitute all the WikiLinks with normal links, styled dependent on each wikipage existence.
return the altered value
The updated question is:
What regular expression (method) can return a python List of WikiLinks, which can be altered and used to substitute the original matches (after being altered).
Edit:
I'd like to do something like this:
def wikilink(value):
regex = re.magic_method(r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]', value)
foreach wikilink in regex:
alias = wikilink.group(0)
text = wikilink.group(1)
if(alias exists in Page):
regex.sub(""+ text +"")
else:
regex.sub("<a href="+alias+" class='redlink'>"+ text +"</a>")
return value
If your string contains other text in addition to the wiki-link, your filter won't work because you are using re.match instead of re.search. re.match matches at the beginning of the string. re.search matches anywhere in the string. See matching vs. searching.
Also, your regex uses the greedy *, so it won't work if one line contains multiple wiki-links. Use *? instead to make it non-greedy:
re.search(r'\[\[(.*?)\|(.*?)\]\]', value)
Edit:
As for tips on how to fix your code, I suggest that you use re.sub with a callback. The advantages are:
It works correctly if you have multiple wiki-links in the same line.
One pass over the string is enough. You don't need a pass to find wiki-links, and another one to do the replacement.
Here is a sketch of the implementation:
import re
WIKILINK_RE = re.compile(r'\[\[(.*?)\|(.*?)\]\]')
def wikilink(value):
def wikilink_sub_callback(match_obj):
alias = match_obj.group(1).strip()
text = match_obj.group(2).strip()
if(alias exists in Page):
class_attr = ''
else:
class_attr = ' class="redlink"'
return '<a href="%s"%s>%s</a>' % (alias, class_attr, text)
return WIKILINK_RE.sub(wikilink_sub_callback, value)
This is the type of problem that falls quickly to a small set of unit tests.
Pieces of the filter that can be tested in isolation (with a bit of code restructuring):
Determining whether or not value contains the pattern you're looking for
What string gets generated if there is a matching Page
What string gets generated is there isn't a matching Page
That would help you isolate where things are going wrong. You'll probably find that you'll need to rewire the regexps to account for optional spaces around the |.
Also, on first glance it looks like your filter is exploitable. You're claiming the result is safe, but you haven't filtered the alt text for nasties like script tags.
Code:
import re
def page_exists(alias):
    """Stand-in for the real database lookup: only 'ThisIsAWikiLink' exists."""
    if alias == 'ThisIsAWikiLink':
        return True
    return False

def wikilink(value):
    """Render each [[Alias | Text]] wiki-link in *value* as an anchor;
    unknown pages get a class="redlink" attribute. None passes through.

    BUG FIX: the replacement was written as '%s' % (alias, text), which
    raises TypeError ("not all arguments converted") — the anchor markup was
    evidently stripped from the original; it is restored here. Patterns are
    raw strings and alias/text are re.escape()d so links containing regex
    metacharacters cannot break (or be misread by) the substitution.
    """
    if value is None:
        return None
    for alias, text in re.findall(r'\[\[\s*(.*?)\s*\|\s*(.*?)\s*\]\]', value):
        pattern = r'\[\[\s*%s\s*\|\s*%s\s*\]\]' % (re.escape(alias), re.escape(text))
        if page_exists(alias):
            repl = '<a href="%s">%s</a>' % (alias, text)
        else:
            repl = '<a href="%s" class="redlink">%s</a>' % (alias, text)
        # Callable replacement: keeps backslashes in repl literal.
        value = re.sub(pattern, lambda m: repl, value)
    return value
Sample results:
>>> import wikilink
>>> wikilink.wikilink(None)
>>> wikilink.wikilink('')
''
>>> wikilink.wikilink('Test')
'Test'
>>> wikilink.wikilink('[[ThisIsAWikiLink | This is the alt text]]')
'This is the alt text'
>>> wikilink.wikilink('[[ThisIsABadWikiLink | This is the alt text]]')
'This is the alt text'
>>> wikilink.wikilink('[[ThisIsAWikiLink | This is the alt text]]\n[[ThisIsAWikiLink | This is another instance]]')
'This is the alt text\nThis is another instance'
>>> wikilink.wikilink('[[ThisIsAWikiLink | This is the alt text]]\n[[ThisIsAWikiLink | This is another instance]]')
General comments:
findall is the magic re function you're looking for
Change page_exists to run whatever query you want
Vulnerable to HTML injection (as mentioned by Dave W. Smith above)
Having to recompile the regex on each iteration is inefficient
Querying the database each time is inefficient
I think you'd run into performance issues pretty quickly with this approach.
This is the working code in case someone needs it:
from django import template
from django.template.defaultfilters import stringfilter
from sites.wiki.models import Page
import re
register = template.Library()
#register.filter
#stringfilter
def wikilink(value):
    """Render [[Alias | Text]] wiki-links in *value* as anchors; links whose
    Page does not exist get a class="redlink" attribute (Wikipedia-style)."""
    WIKILINK_RE = re.compile(r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]')
    def render_link(match_obj):
        alias = match_obj.group(1).strip()
        text = match_obj.group(2).strip()
        try:
            Page.objects.get(alias=alias)
            class_attr = ''
        except Page.DoesNotExist:
            class_attr = ' class="redlink"'
        return '<a href="%s"%s>%s</a>' % (alias, class_attr, text)
    return WIKILINK_RE.sub(render_link, value)
wikilink.is_safe = True
Many thanks for all the answers!

Categories

Resources