Header for XML files with xml.etree.ElementTree [duplicate] - python

I have tried to use the answer in this question, but can't make it work: How to create "virtual root" with Python's ElementTree?
Here's my code:
import xml.etree.cElementTree as ElementTree
from StringIO import StringIO
s = '<?xml version=\"1.0\" encoding=\"UTF-8\" ?><!DOCTYPE tmx SYSTEM \"tmx14a.dtd\" ><tmx version=\"1.4a\" />'
tree = ElementTree.parse(StringIO(s)).getroot()
header = ElementTree.SubElement(tree,'header',{'adminlang': 'EN',})
body = ElementTree.SubElement(tree,'body')
ElementTree.ElementTree(tree).write('myfile.tmx','UTF-8')
When I open the resulting 'myfile.tmx' file, it contains this:
<?xml version='1.0' encoding='UTF-8'?>
<tmx version="1.4a"><header adminlang="EN" /><body /></tmx>
What am I missing? or, is there a better tool?

You could set the xml_declaration argument of the write function to False, so the output won't include an XML declaration with the encoding, and then write whatever header you need manually before the tree. In fact, if you pass the encoding as 'utf-8' (lowercase), the XML declaration won't be added either.
import xml.etree.cElementTree as ElementTree
tree = ElementTree.Element('tmx', {'version': '1.4a'})
ElementTree.SubElement(tree, 'header', {'adminlang': 'EN'})
ElementTree.SubElement(tree, 'body')
with open('myfile.tmx', 'wb') as f:
f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE tmx SYSTEM "tmx14a.dtd">'.encode('utf8'))
ElementTree.ElementTree(tree).write(f, 'utf-8')
Resulting file (newlines added manually for readability):
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE tmx SYSTEM "tmx14a.dtd">
<tmx version="1.4a">
<header adminlang="EN" />
<body />
</tmx>

You could use lxml and its tostring function:
from lxml import etree
s = """<?xml version="1.0" encoding="UTF-8"?>
<tmx version="1.4a"/>"""
tree = etree.fromstring(s)
header = etree.SubElement(tree,'header',{'adminlang': 'EN'})
body = etree.SubElement(tree,'body')
print etree.tostring(tree, encoding="UTF-8",
xml_declaration=True,
pretty_print=True,
doctype='<!DOCTYPE tmx SYSTEM "tmx14a.dtd">')
=>
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE tmx SYSTEM "tmx14a.dtd">
<tmx version="1.4a">
<header adminlang="EN"/>
<body/>
</tmx>

I used a different solution to add the DOCTYPE: very simple, very stupid.
import xml.etree.ElementTree as ET
with open(path_file, "w", encoding='UTF-8') as xf:
doc_type = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE dlg:window ' \
'PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "dialog.dtd">'
tostring = ET.tostring(root).decode('utf-8')
file = f"{doc_type}{tostring}"
xf.write(file)

I couldn't find a solution to this problem using vanilla ElementTree either, and the solution proposed by demalexx created invalid XML that was rejected by my application (DITA).
What I propose is a workaround involving other modules and it works perfectly for me.
import re
# found no way for cleanly specify a <!DOCTYPE ...> stanza in ElementTree so
# so we substitute the current <?xml ... ?> stanza with a full <?xml... + <!DOCTYPE...
new_header = '<?xml version="1.0" encoding="UTF-8" ?>\n' \
'<!DOCTYPE topic PUBLIC "-//OASIS//DTD DITA Topic//EN" "topic.dtd">\n'
target_xml = re.sub(u"\<\?xml .+?>", new_header, source_xml)
with open(filename, 'w') as catalog_file:
catalog_file.write(target_xml.encode('utf8'))

Related

Exporting XML header and doctype to XML file [duplicate]

I have tried to use the answer in this question, but can't make it work: How to create "virtual root" with Python's ElementTree?
Here's my code:
import xml.etree.cElementTree as ElementTree
from StringIO import StringIO
s = '<?xml version=\"1.0\" encoding=\"UTF-8\" ?><!DOCTYPE tmx SYSTEM \"tmx14a.dtd\" ><tmx version=\"1.4a\" />'
tree = ElementTree.parse(StringIO(s)).getroot()
header = ElementTree.SubElement(tree,'header',{'adminlang': 'EN',})
body = ElementTree.SubElement(tree,'body')
ElementTree.ElementTree(tree).write('myfile.tmx','UTF-8')
When I open the resulting 'myfile.tmx' file, it contains this:
<?xml version='1.0' encoding='UTF-8'?>
<tmx version="1.4a"><header adminlang="EN" /><body /></tmx>
What am I missing? or, is there a better tool?
You could set the xml_declaration argument of the write function to False, so the output won't include an XML declaration with the encoding, and then write whatever header you need manually before the tree. In fact, if you pass the encoding as 'utf-8' (lowercase), the XML declaration won't be added either.
import xml.etree.cElementTree as ElementTree
tree = ElementTree.Element('tmx', {'version': '1.4a'})
ElementTree.SubElement(tree, 'header', {'adminlang': 'EN'})
ElementTree.SubElement(tree, 'body')
with open('myfile.tmx', 'wb') as f:
f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE tmx SYSTEM "tmx14a.dtd">'.encode('utf8'))
ElementTree.ElementTree(tree).write(f, 'utf-8')
Resulting file (newlines added manually for readability):
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE tmx SYSTEM "tmx14a.dtd">
<tmx version="1.4a">
<header adminlang="EN" />
<body />
</tmx>
You could use lxml and its tostring function:
from lxml import etree
s = """<?xml version="1.0" encoding="UTF-8"?>
<tmx version="1.4a"/>"""
tree = etree.fromstring(s)
header = etree.SubElement(tree,'header',{'adminlang': 'EN'})
body = etree.SubElement(tree,'body')
print etree.tostring(tree, encoding="UTF-8",
xml_declaration=True,
pretty_print=True,
doctype='<!DOCTYPE tmx SYSTEM "tmx14a.dtd">')
=>
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE tmx SYSTEM "tmx14a.dtd">
<tmx version="1.4a">
<header adminlang="EN"/>
<body/>
</tmx>
I used a different solution to add the DOCTYPE: very simple, very stupid.
import xml.etree.ElementTree as ET
with open(path_file, "w", encoding='UTF-8') as xf:
doc_type = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE dlg:window ' \
'PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "dialog.dtd">'
tostring = ET.tostring(root).decode('utf-8')
file = f"{doc_type}{tostring}"
xf.write(file)
I couldn't find a solution to this problem using vanilla ElementTree either, and the solution proposed by demalexx created invalid XML that was rejected by my application (DITA).
What I propose is a workaround involving other modules and it works perfectly for me.
import re
# found no way for cleanly specify a <!DOCTYPE ...> stanza in ElementTree so
# so we substitute the current <?xml ... ?> stanza with a full <?xml... + <!DOCTYPE...
new_header = '<?xml version="1.0" encoding="UTF-8" ?>\n' \
'<!DOCTYPE topic PUBLIC "-//OASIS//DTD DITA Topic//EN" "topic.dtd">\n'
target_xml = re.sub(u"\<\?xml .+?>", new_header, source_xml)
with open(filename, 'w') as catalog_file:
catalog_file.write(target_xml.encode('utf8'))

How to output XML declaration <?xml version="1.0"?> in Python/ElementTree

I'm trying to create an XML file for the word reference source file, which is in XML. When I write to the file with only "xml_declaration=True", it shows <?xml version='1.0' encoding='us-ascii'?>, but I want it in the form <?xml version="1.0"?>.
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import Element
import xml.etree.ElementTree as ET
import uuid
from lxml import etree
root=Element('b:sources')
root.set('SelectedStyle','')
root.set('xmlns:b','http://schemas.openxmlformats.org/officeDocument/2006/bibliography')
root.set('xmlns','http://schemas.openxmlformats.org/officeDocument/2006/bibliography')
#root.attrib=('SelectedStyle'='', 'xmlns:b'='"http://schemas.openxmlformats.org/officeDocument/2006/bibliography"', 'xmlns:b'='"http://schemas.openxmlformats.org/officeDocument/2006/bibliography"','xmlns'='"http://schemas.openxmlformats.org/officeDocument/2006/bibliography"')
source=ET.SubElement(root, 'b:source')
ET.SubElement(source,'b:Tag')
ET.SubElement(source,'b:SourceType').text='Misc'
ET.SubElement(source,'b:guid').text=str(uuid.uuid1())
Author=ET.SubElement(source,'b:Author')
Author2=ET.SubElement(Author,'b:Author')
ET.SubElement(Author2,'b:Corporate').text='Norsk olje og gass'
ET.SubElement(source, 'b:Title').text='R-002'
ET.SubElement(source, 'b:Year').text='2019'
ET.SubElement(source, 'b:Month').text='10'
ET.SubElement(source, 'b:Day').text='27'
tree=ElementTree(root)
tree.write('Sources.xml', xml_declaration=True, method='xml')
Answer:
When using xml.etree.ElementTree there is no way to avoid the inclusion of an encoding attribute in the declaration. If you don't want an encoding attribute in the XML declaration at all, you need to use xml.dom.minidom not xml.etree.ElementTree.
Here is a snippet to set up an example:
import xml.etree.ElementTree
a = xml.etree.ElementTree.Element('a')
tree = xml.etree.ElementTree.ElementTree(element=a)
root = tree.getroot()
Omit Encoding:
out = xml.etree.ElementTree.tostring(root, xml_declaration=True)
b"<?xml version='1.0' encoding='us-ascii'?>\n<a />"
Encoding us-ascii:
out = xml.etree.ElementTree.tostring(root, encoding='us-ascii', xml_declaration=True)
b"<?xml version='1.0' encoding='us-ascii'?>\n<a />"
Encoding unicode:
out = xml.etree.ElementTree.tostring(root, encoding='unicode', xml_declaration=True)
"<?xml version='1.0' encoding='UTF-8'?>\n<a />"
Using minidom:
Let's take the first example from above with the encoding omitted and use the variable out as the input to xml.dom.minidom and you will see the output that you're seeking.
import xml.dom.minidom
dom = xml.dom.minidom.parseString(out)
dom.toxml()
'<?xml version="1.0" ?><a/>'
There is also a pretty print option:
dom.toprettyxml()
'<?xml version="1.0" ?>\n<a/>\n'
Note
Take a look at the source code, and you can see that the encoding is hard coded in the output.
with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
if method == "xml" and (xml_declaration or
(xml_declaration is None and
declared_encoding.lower() not in ("utf-8", "us-ascii"))):
write("<?xml version='1.0' encoding='%s'?>\n" % (
declared_encoding,))
https://github.com/python/cpython/blob/550c44b89513ea96d209e2ff761302238715f082/Lib/xml/etree/ElementTree.py#L731-L736

Read a non formatted xml and export it again formatted? [duplicate]

Here is the code but the exported xml appears badly formatted.
import xml.etree.ElementTree as ET
import os
sampleXML = """<?xml version="1.0" encoding="ASCII"?>
<Metadata version="1.0">
<CODE_OK>510</CODE_OK>
<DeliveryDate>13/08/2018</DeliveryDate>
</Metadata>
"""
tree = ET.ElementTree(ET.fromstring(sampleXML))
for folder in os.listdir("YourPath"): #Iterate the dir
tree.find("CODE_OK").text = folder #Update dir name in XML
tree.write(open(os.path.join(r"Path", folder, "newxml.xml"), "wb")) #Write to XML
How to make the exported xml appear normally formatted?
I found in the docs that the xml module has an implementation of the Document Object Model interface. Here is a simple example:
from xml.dom.minidom import parseString
example = parseString(sampleXML) # your string
# write to file
with open('file.xml', 'w') as file:
example.writexml(file, indent='\n', addindent=' ')
Output:
<?xml version="1.0" ?>
<Metadata version="1.0">
<CODE_OK>510</CODE_OK>
<DeliveryDate>13/08/2018</DeliveryDate>
</Metadata>
Update
You can also write like this
example = parseString(sampleXML).toprettyxml()
with open('file.xml', 'w') as file:
file.write(example)
Output:
<?xml version="1.0" ?>
<Metadata version="1.0">
<CODE_OK>510</CODE_OK>
<DeliveryDate>13/08/2018</DeliveryDate>
</Metadata>
Update 2
I copied all of your code and only added the indent function from this site, and it works correctly for me.
import xml.etree.ElementTree as ET
import os
sampleXML = "your xml"
tree = ET.ElementTree(ET.fromstring(sampleXML))
indent(tree.getroot()) # this I add
for folder in os.listdir(path):
tree.find("CODE_OK").text = folder
tree.write(open(os.path.join(path, folder, "newxml.xml"), "wb"))

xml_declaration = True <?xml version="1.0" encoding="UTF-8"?>

I am using this code:
tree.write(xmlFileOut, pretty_print = True, xml_declaration = True, encoding='UTF-8')
to write my xml with xml declaration but it is producing:
<?xml version='1.0' encoding='UTF-8'?>
But I need it to produce:
<?xml version="1.0" encoding="UTF-8"?>
I am using python with lxml.
What do I need to do?
Cheers.

How to create <!DOCTYPE> with Python's cElementTree

I have tried to use the answer in this question, but can't make it work: How to create "virtual root" with Python's ElementTree?
Here's my code:
import xml.etree.cElementTree as ElementTree
from StringIO import StringIO
s = '<?xml version=\"1.0\" encoding=\"UTF-8\" ?><!DOCTYPE tmx SYSTEM \"tmx14a.dtd\" ><tmx version=\"1.4a\" />'
tree = ElementTree.parse(StringIO(s)).getroot()
header = ElementTree.SubElement(tree,'header',{'adminlang': 'EN',})
body = ElementTree.SubElement(tree,'body')
ElementTree.ElementTree(tree).write('myfile.tmx','UTF-8')
When I open the resulting 'myfile.tmx' file, it contains this:
<?xml version='1.0' encoding='UTF-8'?>
<tmx version="1.4a"><header adminlang="EN" /><body /></tmx>
What am I missing? or, is there a better tool?
You could set the xml_declaration argument of the write function to False, so the output won't include an XML declaration with the encoding, and then write whatever header you need manually before the tree. In fact, if you pass the encoding as 'utf-8' (lowercase), the XML declaration won't be added either.
import xml.etree.cElementTree as ElementTree
tree = ElementTree.Element('tmx', {'version': '1.4a'})
ElementTree.SubElement(tree, 'header', {'adminlang': 'EN'})
ElementTree.SubElement(tree, 'body')
with open('myfile.tmx', 'wb') as f:
f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE tmx SYSTEM "tmx14a.dtd">'.encode('utf8'))
ElementTree.ElementTree(tree).write(f, 'utf-8')
Resulting file (newlines added manually for readability):
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE tmx SYSTEM "tmx14a.dtd">
<tmx version="1.4a">
<header adminlang="EN" />
<body />
</tmx>
You could use lxml and its tostring function:
from lxml import etree
s = """<?xml version="1.0" encoding="UTF-8"?>
<tmx version="1.4a"/>"""
tree = etree.fromstring(s)
header = etree.SubElement(tree,'header',{'adminlang': 'EN'})
body = etree.SubElement(tree,'body')
print etree.tostring(tree, encoding="UTF-8",
xml_declaration=True,
pretty_print=True,
doctype='<!DOCTYPE tmx SYSTEM "tmx14a.dtd">')
=>
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE tmx SYSTEM "tmx14a.dtd">
<tmx version="1.4a">
<header adminlang="EN"/>
<body/>
</tmx>
I used a different solution to add the DOCTYPE: very simple, very stupid.
import xml.etree.ElementTree as ET
with open(path_file, "w", encoding='UTF-8') as xf:
doc_type = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE dlg:window ' \
'PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "dialog.dtd">'
tostring = ET.tostring(root).decode('utf-8')
file = f"{doc_type}{tostring}"
xf.write(file)
I couldn't find a solution to this problem using vanilla ElementTree either, and the solution proposed by demalexx created invalid XML that was rejected by my application (DITA).
What I propose is a workaround involving other modules and it works perfectly for me.
import re
# found no way for cleanly specify a <!DOCTYPE ...> stanza in ElementTree so
# so we substitute the current <?xml ... ?> stanza with a full <?xml... + <!DOCTYPE...
new_header = '<?xml version="1.0" encoding="UTF-8" ?>\n' \
'<!DOCTYPE topic PUBLIC "-//OASIS//DTD DITA Topic//EN" "topic.dtd">\n'
target_xml = re.sub(u"\<\?xml .+?>", new_header, source_xml)
with open(filename, 'w') as catalog_file:
catalog_file.write(target_xml.encode('utf8'))

Categories

Resources