Normalize XML text node in Python minidom - python

I want to insert this string:
No, on the 5<Font Script="super">th</Font>
as a Text Node in XML by xml.dom.minidom createTextNode(), however, after I writexml() to a file, the signs:
< > "
turns to:
No, on the 5&lt;Font Script=&quot;super&quot;&gt;th&lt;/Font&gt;
How can I avoid this? Thanks.
A part of my code:
impl = minidom.getDOMImplementation()
dom = impl.createDocument(None, None, None)
TextTextNode = dom.createTextNode(text.decode("utf-8"))
Text = dom.createElement("Text")
Text.appendChild(TextTextNode)
fileToWrite = codecs.open(output, 'w', encoding='utf-8')
dom.writexml(fileToWrite, indent=" ", addindent=" ", newl="\n", encoding='utf-8')
fileToWrite.close()
There is a sample for this in the CineCanvas specification:
<Text HAlign=”left” HPosition=”10.2” VAlign=”bottom” VPosition=”10.0”> This <Font Script=”super”>word </Font>is superscript </Text >
I need to insert the <Font>..</Font> element into another element, the <Text> element.

I'm not familiar with that format, but that thing looks like an XML node. Try this:
from xml.dom import minidom
import codecs
# Build <Text>No, on the 5<Font Script="super">th</Font></Text> as real DOM
# nodes, so the markup is written as elements instead of escaped text.
output = "test.xml"
# A unicode literal works on both Python 2 and Python 3; str.decode() only
# exists on Python 2.
text = u"No, on the 5"

impl = minidom.getDOMImplementation()
dom = impl.createDocument(None, None, None)

# The superscript part is an element with an attribute and a text child.
font_node = dom.createElement("Font")
font_node.setAttribute('Script', 'super')
font_node.appendChild(dom.createTextNode('th'))

# The enclosing <Text> element holds the plain text followed by <Font>.
text_node = dom.createElement("Text")
text_node.appendChild(dom.createTextNode(text))
text_node.appendChild(font_node)

# The context manager closes the file even if writexml() raises.
with codecs.open(output, 'w', encoding='utf-8') as file_to_write:
    # Serialize starting at the <Text> element, not at the empty document.
    text_node.writexml(file_to_write, indent=" ", addindent=" ", newl="\n")
That outputs:
<Text>
No, on the 5
<Font Script="super">th</Font>
</Text>
Be aware that when you want to write a tree to a file (when you call writexml), you need to call the writexml method on your XML tree's root node (you were calling it on dom, not on your root node).

Related

How to indent my xml data which in xml file python [duplicate]

After reading from an existing file with 'ugly' XML and doing some modifications, pretty printing doesn't work. I've tried etree.write(FILE_NAME, pretty_print=True).
I have the following XML:
<testsuites tests="14" failures="0" disabled="0" errors="0" time="0.306" name="AllTests">
<testsuite name="AIR" tests="14" failures="0" disabled="0" errors="0" time="0.306">
....
And I use it like this:
tree = etree.parse('original.xml')
root = tree.getroot()
...
# modifications
...
with open(FILE_NAME, "w") as f:
tree.write(f, pretty_print=True)
For me, this issue was not solved until I noticed this little tidbit here:
http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output
Short version:
Read in the file with this command:
>>> parser = etree.XMLParser(remove_blank_text=True)
>>> tree = etree.parse(filename, parser)
That will "reset" the already existing indentation, allowing the output to generate its own indentation correctly. Then pretty_print as normal:
>>> tree.write(<output_file_name>, pretty_print=True)
Well, according to the API docs, there is no method "write" in the lxml etree module. You've got a couple of options in regards to getting a pretty printed xml string into a file. You can use the tostring method like so:
# etree.tostring() returns bytes by default, so the file is opened in binary
# mode; the context manager closes it even if serialization fails.
with open('doc.xml', 'wb') as f:
    f.write(etree.tostring(root, pretty_print=True))
Or, if your input source is less than perfect and/or you want more knobs and buttons to configure your out put you could use one of the python wrappers for the tidy lib.
http://utidylib.berlios.de/
import tidy
f.write(tidy.parseString(your_xml_str, **{'output_xml':1, 'indent':1, 'input_xml':1}))
http://countergram.com/open-source/pytidylib
from tidylib import tidy_document
document, errors = tidy_document(your_xml_str, options={'output_xml':1, 'indent':1, 'input_xml':1})
f.write(document)
# open() replaces the Python 2-only file() builtin, matching the Python 3
# print(..., file=fp) call below; the context manager closes the file.
with open('out.txt', 'w') as fp:
    # encoding="unicode" makes tostring() return text rather than bytes, so
    # print() writes the XML itself and not a b'...' repr.
    print(etree.tostring(..., encoding="unicode"), file=fp)
Here is an answer that is fixed to work with Python 3:
from lxml import etree
from sys import stdout
from io import BytesIO
parser = etree.XMLParser(remove_blank_text = True)
file_obj = BytesIO(text)
tree = etree.parse(file_obj, parser)
tree.write(stdout.buffer, pretty_print = True)
where text is the xml code as a sequence of bytes.
I am not sure why other answers did not mention this. If you want to obtain the root of the xml there is a method called getroot(). I hope I answered your question (though a little late).
tree = et.parse(xmlFile)
root = tree.getroot()
Of course - pretty print of lxml.etree is possible.
In my case, the old trick with remove_blank_text=True and pretty_print=True was not working as I expected (was too delicate), so I decided to write it by myself.
Here it is - a modern, forcible, native pythonic way to correct lxml.etree.Element tree indentation.
This gives a nicely prettified XML string:
from typing import Optional
import lxml.etree
def indent_lxml(
    element: "lxml.etree._Element",
    level: int = 0,
    is_last_child: bool = True,
    *,
    space: str = " ",
) -> None:
    """Re-indent *element* and its whole subtree in place.

    Only the ``text`` and ``tail`` attributes are rewritten: existing
    surrounding whitespace is stripped and replaced by a newline plus
    ``level`` repetitions of ``space``.

    Args:
        element: Root of the subtree to re-indent. Anything that supports
            ``len()``, iteration over children and ``text``/``tail`` works.
        level: Nesting depth of *element*; 0 for the tree root.
        is_last_child: Whether *element* is the last child of its parent, in
            which case its tail indents the parent's closing tag.
        space: Indentation unit for one nesting level.
    """
    indent_str = "\n" + level * space

    element.text = strip_or_null(element.text)
    if element.text:
        # Leading text moves to its own line, one level deeper than the tag.
        element.text = f"{indent_str}{space}{element.text}"

    num_children = len(element)
    if num_children:
        # Whatever precedes the first child must end with the child's indent.
        element.text = f"{element.text or ''}{indent_str}{space}"
        # Iterating the element yields its direct children.
        for index, child in enumerate(element):
            indent_lxml(child, level + 1, index == num_children - 1, space=space)
    elif element.text:
        # Text-only element: put the closing tag on its own line.
        element.text += indent_str

    # The tail of the last child indents the parent's closing tag (one level
    # up); any other tail indents the next sibling (same level).
    tail_level = max(0, level - 1) if is_last_child else level
    tail_indent = "\n" + tail_level * space
    tail = strip_or_null(element.tail)
    element.tail = f"{indent_str}{tail}{tail_indent}" if tail else tail_indent


def strip_or_null(text: Optional[str]) -> Optional[str]:
    """Return *text* stripped of surrounding whitespace, or None if nothing is left."""
    if text is None:
        return None
    return text.strip() or None
It's decently fast, because it doesn't allocate any additional structures in memory while traversing the tree - it visits each node only once, giving the best possible O(N) computational complexity.
It rearranges all the existing indentation "in place" in the tree (the DOM) by correcting contents of Element.text and Element.tail attributes (affects white-spaces only).
Naturally, it also can be used with HTML parsed by lxml.
In order to use it, do something like that:
root = lxml.etree.parse("path/to/the_file.xml").getroot()
# or
root = lxml.etree.fromstring("<xml><body><leaf1/><leaf2/></body></xml>")
indent_lxml(root) # corrects indentation "in place"
result = lxml.etree.tostring(root, encoding="unicode")
print(result)
Which prints:
<xml>
<body>
<leaf1/>
<leaf2/>
</body>
</xml>

Don't encode Element text object using Python ElementTree

I'm trying to use HTML data inside an the text node of an element, but it gets
encoded as if it were meant to not be HTML data.
Here is an MWE:
from xml.etree import ElementTree as ET

# HTML fragment obtained from elsewhere. It is assigned as the element's
# *text* below, which is why the serializer escapes its angle brackets.
data = '<a href="https://example.com">Example data gained from elsewhere.</a>'
p = ET.Element('p')
p.text = data
p = ET.tostring(p, encoding='utf-8', method='html').decode('utf8')
print(p)
The output is...
<p>&lt;a href="https://example.com"&gt;Example data gained from elsewhere.&lt;/a&gt;</p>
What I intended is...
<p><a href="https://example.com">Example data gained from elsewhere.</a></p>
What you are doing is wrong. You are assigning p.text = data, which basically considers the node to be text content. Its quite obvious the text is escaped.
You have to add it as a child. like below:
from xml.etree import ElementTree as ET

# The fragment must be well-formed markup for ET.fromstring() to parse it.
data = '<a href="https://example.com">Example data gained from elsewhere.</a>'
# Parse the fragment into an element and attach it as a child, so that it is
# serialized as markup instead of being escaped as text.
d = ET.fromstring(data)
p = ET.Element('p')
p.append(d)
p = ET.tostring(p, encoding='utf-8', method='html').decode('utf8')
print(p)
Giving output
<p><a href="https://example.com">Example data gained from elsewhere.</a></p>
You can parse the HTML string into an ElementTree object and append it to the DOM:
from xml.etree import ElementTree as ET

# The fragment must be well-formed markup for ET.fromstring() to parse it.
data = '<a href="https://example.com">Example data gained from elsewhere.</a>'
p = ET.Element('p')
# Appending the parsed element (rather than setting p.text) keeps the markup
# unescaped in the output.
p.append(ET.fromstring(data))
p = ET.tostring(p, encoding='utf-8', method='html').decode('utf8')
print(p)

How to avoid double escape using XML

I'm using python to make a program which will have to write data in a XML tag of a specific file.
The line of data I'm willing to write is the following.
&lt;Stream&gt;XXXX-XXXX-XXXX-XXXX?p=0&lt;/Stream&gt;&lt;URL&gt;rtmp://a.rtmp.youtube.com/live2&lt;/URL&gt;
But what I get in my XML file after writing is pretty different.
&amp;lt;Stream&amp;gt;XXXX-XXXX-XXXX-XXXX?p=0&amp;lt;/Stream&amp;gt;&amp;lt;URL&amp;gt;rtmp://a.rtmp.youtube.com/live2&amp;lt;/URL&amp;gt;
The &lt; and &gt; are here on purpose, and are NOT < and >. I need to keep this formatting, but when I export the XML file, it replaces every & with &amp;.
I use this code to write data in the xml file:
from xml.dom import minidom

from lxml import etree as ET

Name_with_single_quote= """IF [Calculation_1] = 'Day-1' THEN [begintime] + 1
ELSEIF[Calculation_1] < 'Day-2' THEN [begintime] + 2
ELSEIF [Calculation_1] > "Day-3" THEN [begintime] + 3
ELSE [begintime]
END"""
# Manually turn the special characters into XML entities before the value is
# stored as an attribute.
# NOTE(review): the newline entity is assumed to be "&#10;" (it may have been
# "&#13;&#10;") - confirm against the consumer of this file.
# NOTE(review): the question reports that every "&" written here shows up as
# "&amp;" in the exported file, i.e. these entities end up escaped twice.
Name_with_single_quote = (
    Name_with_single_quote.replace("\n", "&#10;")
    .replace("<", "&lt;")
    .replace("'", "&apos;")
    .replace(">", "&gt;")
    .replace("\"", "&quot;")
)
Name_with_single_quote = str(Name_with_single_quote)
xml = """<?xml version="1.0"?>
<column role="dimension" type="nominal" name="[Calculation_1]" datatype="boolean" caption="">
<calculation formula=""/>
</column>"""
tree = ET.fromstring(xml)
formula = tree.find('.//calculation')
formula.set('formula', Name_with_single_quote)
xmlstr = minidom.parseString(ET.tostring(tree)).toprettyxml()
# Drop the blank lines that toprettyxml() leaves in its output.
xmlstr = '\n'.join(line for line in xmlstr.split('\n') if line.strip())
with open('test_for_esc_result.xml', "w") as f:
    f.write(xmlstr)

How to properly format xml file using lxml? [duplicate]

After reading from an existing file with 'ugly' XML and doing some modifications, pretty printing doesn't work. I've tried etree.write(FILE_NAME, pretty_print=True).
I have the following XML:
<testsuites tests="14" failures="0" disabled="0" errors="0" time="0.306" name="AllTests">
<testsuite name="AIR" tests="14" failures="0" disabled="0" errors="0" time="0.306">
....
And I use it like this:
tree = etree.parse('original.xml')
root = tree.getroot()
...
# modifications
...
with open(FILE_NAME, "w") as f:
tree.write(f, pretty_print=True)
For me, this issue was not solved until I noticed this little tidbit here:
http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output
Short version:
Read in the file with this command:
>>> parser = etree.XMLParser(remove_blank_text=True)
>>> tree = etree.parse(filename, parser)
That will "reset" the already existing indentation, allowing the output to generate its own indentation correctly. Then pretty_print as normal:
>>> tree.write(<output_file_name>, pretty_print=True)
Well, according to the API docs, there is no method "write" in the lxml etree module. You've got a couple of options in regards to getting a pretty printed xml string into a file. You can use the tostring method like so:
# etree.tostring() returns bytes by default, so the file is opened in binary
# mode; the context manager closes it even if serialization fails.
with open('doc.xml', 'wb') as f:
    f.write(etree.tostring(root, pretty_print=True))
Or, if your input source is less than perfect and/or you want more knobs and buttons to configure your out put you could use one of the python wrappers for the tidy lib.
http://utidylib.berlios.de/
import tidy
f.write(tidy.parseString(your_xml_str, **{'output_xml':1, 'indent':1, 'input_xml':1}))
http://countergram.com/open-source/pytidylib
from tidylib import tidy_document
document, errors = tidy_document(your_xml_str, options={'output_xml':1, 'indent':1, 'input_xml':1})
f.write(document)
# open() replaces the Python 2-only file() builtin, matching the Python 3
# print(..., file=fp) call below; the context manager closes the file.
with open('out.txt', 'w') as fp:
    # encoding="unicode" makes tostring() return text rather than bytes, so
    # print() writes the XML itself and not a b'...' repr.
    print(etree.tostring(..., encoding="unicode"), file=fp)
Here is an answer that is fixed to work with Python 3:
from lxml import etree
from sys import stdout
from io import BytesIO
parser = etree.XMLParser(remove_blank_text = True)
file_obj = BytesIO(text)
tree = etree.parse(file_obj, parser)
tree.write(stdout.buffer, pretty_print = True)
where text is the xml code as a sequence of bytes.
I am not sure why other answers did not mention this. If you want to obtain the root of the xml there is a method called getroot(). I hope I answered your question (though a little late).
tree = et.parse(xmlFile)
root = tree.getroot()
Of course - pretty print of lxml.etree is possible.
In my case, the old trick with remove_blank_text=True and pretty_print=True was not working as I expected (was too delicate), so I decided to write it by myself.
Here it is - a modern, forcible, native pythonic way to correct lxml.etree.Element tree indentation.
This gives a nicely prettified XML string:
from typing import Optional
import lxml.etree
def indent_lxml(
    element: "lxml.etree._Element",
    level: int = 0,
    is_last_child: bool = True,
    *,
    space: str = " ",
) -> None:
    """Re-indent *element* and its whole subtree in place.

    Only the ``text`` and ``tail`` attributes are rewritten: existing
    surrounding whitespace is stripped and replaced by a newline plus
    ``level`` repetitions of ``space``.

    Args:
        element: Root of the subtree to re-indent. Anything that supports
            ``len()``, iteration over children and ``text``/``tail`` works.
        level: Nesting depth of *element*; 0 for the tree root.
        is_last_child: Whether *element* is the last child of its parent, in
            which case its tail indents the parent's closing tag.
        space: Indentation unit for one nesting level.
    """
    indent_str = "\n" + level * space

    element.text = strip_or_null(element.text)
    if element.text:
        # Leading text moves to its own line, one level deeper than the tag.
        element.text = f"{indent_str}{space}{element.text}"

    num_children = len(element)
    if num_children:
        # Whatever precedes the first child must end with the child's indent.
        element.text = f"{element.text or ''}{indent_str}{space}"
        # Iterating the element yields its direct children.
        for index, child in enumerate(element):
            indent_lxml(child, level + 1, index == num_children - 1, space=space)
    elif element.text:
        # Text-only element: put the closing tag on its own line.
        element.text += indent_str

    # The tail of the last child indents the parent's closing tag (one level
    # up); any other tail indents the next sibling (same level).
    tail_level = max(0, level - 1) if is_last_child else level
    tail_indent = "\n" + tail_level * space
    tail = strip_or_null(element.tail)
    element.tail = f"{indent_str}{tail}{tail_indent}" if tail else tail_indent


def strip_or_null(text: Optional[str]) -> Optional[str]:
    """Return *text* stripped of surrounding whitespace, or None if nothing is left."""
    if text is None:
        return None
    return text.strip() or None
It's decently fast, because it doesn't allocate any additional structures in memory while traversing the tree - it visits each node only once, giving the best possible O(N) computational complexity.
It rearranges all the existing indentation "in place" in the tree (the DOM) by correcting contents of Element.text and Element.tail attributes (affects white-spaces only).
Naturally, it also can be used with HTML parsed by lxml.
In order to use it, do something like that:
root = lxml.etree.parse("path/to/the_file.xml").getroot()
# or
root = lxml.etree.fromstring("<xml><body><leaf1/><leaf2/></body></xml>")
indent_lxml(root) # corrects indentation "in place"
result = lxml.etree.tostring(root, encoding="unicode")
print(result)
Which prints:
<xml>
<body>
<leaf1/>
<leaf2/>
</body>
</xml>

How can I parse the below XML data using Python?

Source XML
<?xml version='1.0' encoding='UTF-8'?>
<ProcessType xmlns:xmi="http://www.omg.org/XMI" xmi:version="2.0" defaultContext="Default">
<node componentName="tRedshiftRow" componentVersion="0.102" offsetLabelX="0" offsetLabelY="0" posX="-32" posY="96">
<elementParameter field="TECHNICAL" name="QUERYSTORE:QUERYSTORE_TYPE" value="BUILT_IN"/>
<elementParameter field="TEXT" name="DBNAME" value="&quot;&quot;"/>
<elementParameter field="TEXT" name="SCHEMA_DB" value="&quot;&quot;"/>
<elementParameter field="MEMO_SQL" name="QUERY" value="&quot;DELETE FROM schema.tablename;&quot;"/>
</node>
</ProcessType>
I want to get the DELETE statement only where tag is "QUERY", and write it in a text file.
Expected output : DELETE FROM schema.tablename;
I was trying the following way, which obviously didn't work out !
from lxml import etree, objectify
import xml.etree.ElementTree as ET
def convert_xml_to_comp():
    """Strip namespaces from source.xml and export its QUERY statements.

    Reads ``source.xml``, removes the ``{namespace}`` prefix from every tag,
    writes the cleaned tree to ``done.xml``, then writes the ``value`` of
    every ``elementParameter`` named ``QUERY`` to ``newdelete.txt``, one per
    line and without the surrounding double quotes.
    """
    metadata = 'source.xml'
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(metadata, parser)
    root = tree.getroot()
    for elem in root.iter():
        # Comments and processing instructions have a non-string tag.
        if not isinstance(elem.tag, str):
            continue
        # Tags look like "{namespace-uri}local"; keep only the local part.
        i = elem.tag.find('}')
        if i >= 0:
            elem.tag = elem.tag[i + 1:]
    objectify.deannotate(root, cleanup_namespaces=True)
    tree.write('done.xml', pretty_print=True, xml_declaration=True, encoding='UTF-8')
    # Re-read the cleaned file with the standard-library parser.
    tree = ET.parse('done.xml')

    def get_sql_text():
        """Write each QUERY parameter's SQL text to newdelete.txt."""
        # The context manager closes the file even if a write fails.
        with open("newdelete.txt", "w") as out:
            for node in tree.getroot():
                # "@name" selects on the attribute; "#name" is not a valid
                # predicate.
                for param in node.iterfind('elementParameter[@name="QUERY"]'):
                    value = param.get('value')
                    if value is not None:
                        # The stored value is wrapped in literal double quotes.
                        out.write(value.strip('"') + "\n")

    get_sql_text()


if __name__ == '__main__':
    convert_xml_to_comp()
You do this all in a just a couple of statements using an xpath query. Something like:
>>> from lxml import etree
>>> doc = etree.parse(open('data.xml'))
>>> query = doc.xpath('//elementParameter[@name="QUERY"]')[0].get('value')
>>> print(query)
"DELETE FROM schema.tablename;"
This says "find all the elementParameter elements with name="QUERY" and then return the value of the value attribute of the first one.
To select just those elements that contain "DELETE" in their value attribute, use the contains() function:
>>> doc.xpath('//elementParameter[@name="QUERY" and contains(@value, "DELETE")]')

Categories

Resources