Write ElementTree directly to zip with utf-8 encoding - python

I want to modify a large number of XMLs. They are stored in ZIP files. The source XMLs are UTF-8 encoded (at least according to the file tool on Linux) and have a correct XML declaration:
<?xml version='1.0' encoding='UTF-8'?>.
The target ZIPs and the XMLs contained therein should also have the correct XML declaration. However, the (at least to me) most obvious method (using ElementTree.tostring) fails.
Here is a self-contained example, that should work out of the box.
Short walkthrough:
imports
preparations (creating src.zip, these ZIPs are a given in my actual application)
actual work of program (modifying XMLs), starting at # read XMLs from zip
Please focus on the lower part, especially # APPROACH 1, APPROACH 2, APPROACH 3:
import os
import tempfile
import zipfile
# ElementTree and tostring are both used below, so they must be imported too.
from xml.etree.ElementTree import Element, ElementTree, parse, tostring

src_1 = os.path.join(tempfile.gettempdir(), "one.xml")
src_2 = os.path.join(tempfile.gettempdir(), "two.xml")
src_zip = os.path.join(tempfile.gettempdir(), "src.zip")
trgt_appr1_zip = os.path.join(tempfile.gettempdir(), "trgt_appr1.zip")
trgt_appr2_zip = os.path.join(tempfile.gettempdir(), "trgt_appr2.zip")
trgt_appr3_zip = os.path.join(tempfile.gettempdir(), "trgt_appr3.zip")
# file on hard disk that must be used due to ElementTree insufficiencies
tmp_xml_name = os.path.join(tempfile.gettempdir(), "curr_xml.tmp")

# prepare src.zip
tree1 = ElementTree(Element('hello', {'beer': 'good'}))
tree1.write(src_1, encoding="UTF-8", xml_declaration=True)
tree2 = ElementTree(Element('scnd', {'äkey': 'a value'}))
tree2.write(src_2, encoding="UTF-8", xml_declaration=True)
with zipfile.ZipFile(src_zip, 'a') as src:
    with open(src_1, 'r', encoding="utf-8") as one:
        string_representation = one.read()
    # write to zip
    src.writestr(zinfo_or_arcname="one.xml", data=string_representation.encode("utf-8"))
    with open(src_2, 'r', encoding="utf-8") as two:
        string_representation = two.read()
    # write to zip
    src.writestr(zinfo_or_arcname="two.xml", data=string_representation.encode("utf-8"))
os.remove(src_1)
os.remove(src_2)

# read XMLs from zip
with zipfile.ZipFile(src_zip, 'r') as zfile:
    updated_trees = []
    for xml_name in zfile.namelist():
        # parse() reads the whole member, so the handle can be closed right away
        with zfile.open(xml_name, 'r') as curr_file:
            tree = parse(curr_file)
        # modify tree
        updated_tree = tree
        updated_tree.getroot().append(Element('new', {'newkey': 'new value'}))
        updated_trees.append((xml_name, updated_tree))

for xml_name, updated_tree in updated_trees:
    # write to target file
    with zipfile.ZipFile(trgt_appr1_zip, 'a') as trgt1_zip, \
            zipfile.ZipFile(trgt_appr2_zip, 'a') as trgt2_zip, \
            zipfile.ZipFile(trgt_appr3_zip, 'a') as trgt3_zip:
        #
        # APPROACH 1 [DESIRED, BUT DOES NOT WORK]: write tree to zip-file
        # encoding in XML declaration missing
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='UTF-8', method='xml')
        # write XML directly to zip
        trgt1_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)
        #
        # APPROACH 2 [WORKS IN THEORY, BUT DOES NOT WORK]: write tree to zip-file
        # encoding in XML declaration is faulty (is 'utf8', should be 'utf-8' or 'UTF-8')
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='utf8', method='xml')
        # write XML directly to zip
        trgt2_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)
        #
        # APPROACH 3 [WORKS, BUT LACKS PERFORMANCE]: write to file, then read from file, then write to zip
        #
        # write to file
        updated_tree.write(tmp_xml_name, encoding="UTF-8", method="xml", xml_declaration=True)
        # read from file
        with open(tmp_xml_name, 'r', encoding="utf-8") as tmp:
            string_representation = tmp.read()
        # write to zip
        trgt3_zip.writestr(zinfo_or_arcname=xml_name, data=string_representation.encode("utf-8"))
        os.remove(tmp_xml_name)
APPROACH 3 works, but it is much more resource-intensive than the other two.
APPROACH 2 is the only way I could get an ElementTree object to be written with an actual XML declaration -- which then turns out to be invalid (utf8 instead of UTF-8/utf-8).
APPROACH 1 would be most desired -- but fails during reading later in the pipeline, as the XML declaration is missing.
Question: How can I get rid of writing the whole XML to disk first, only to read it afterwards, write it to the zip and delete it after being done with the zip? What am I missing?

You can use an io.BytesIO object.
This allows using ElementTree.write, while avoiding exporting the tree to disk:
import zipfile
from io import BytesIO
from xml.etree.ElementTree import ElementTree, Element

# Serialise the tree into an in-memory buffer instead of a file on disk.
root = Element('hello', {'beer': 'good'})
tree = ElementTree(root)
buffer = BytesIO()
tree.write(buffer, xml_declaration=True, encoding='UTF-8')

# Store the buffered bytes as a member of the archive.
xml_bytes = buffer.getvalue()
with zipfile.ZipFile('/tmp/test.zip', 'w') as archive:
    archive.writestr('test.xml', xml_bytes)
If you are using Python 3.6 or higher, there's an even shorter solution:
you can get a writable file object from the ZipFile object, which you can pass to ElementTree.write:
import zipfile
from xml.etree.ElementTree import ElementTree, Element

root = Element('hello', {'beer': 'good'})
tree = ElementTree(root)

# Open the archive and a writable member in one statement; the member is
# closed before the archive, exactly as with two nested ``with`` blocks.
with zipfile.ZipFile('/tmp/test.zip', 'w') as archive, \
        archive.open('test.xml', 'w') as member:
    tree.write(member, xml_declaration=True, encoding='UTF-8')
This also has the advantage that you don't store multiple copies of the tree in memory, which could be a relevant issue for large trees.

The only thing that is really missing in approach one is the XML declaration header. For ElementTree.write(...) you can use the xml_declaration argument; unfortunately, in your Python version it isn't available in ElementTree.tostring yet.
Starting with Python 3.8, the ElementTree.tostring method does have a xml_declaration argument, see:
https://docs.python.org/3.8/library/xml.etree.elementtree.html
Even though that implementation is unavailable to you when using Python 3.6, you can easily copy the 3.8 implementation in your own Python file:
import io
# The backport wraps the element in an ElementTree, so that class must be imported.
from xml.etree.ElementTree import ElementTree


def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned.  Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  *xml_declaration*,
    *default_namespace* and *short_empty_elements* are passed through
    unchanged to ``ElementTree.write()``.

    Returns an (optionally) encoded string containing the XML data.
    """
    # Text buffer for "unicode" output, byte buffer for every real encoding.
    stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
    ElementTree(element).write(stream, encoding,
                               xml_declaration=xml_declaration,
                               default_namespace=default_namespace,
                               method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()
(See https://github.com/python/cpython/blob/v3.8.0/Lib/xml/etree/ElementTree.py#L1116)
In that case you can simply use approach one:
# create byte representation of elementtree
byte_representation = tostring(element=updated_tree.getroot(), encoding='UTF-8', method='xml', xml_declaration=True)
# write XML directly to zip
trgt1_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)

Related

XML Python: XML code is duplicated after saving to file

I have a code that in principle is to open the file content and wrap it with an additional import tag:
with open('oferta-empik.xml', 'r+', encoding='utf-8') as f:
xml = '<import>' + f.read() + '</import>'
print(xml)
f.write(xml)
f.close()
Unfortunately, after saving half the code is unchanged, and then the xml code already wrapped in the import is inserted into the file.
In total, the file duplicates the xml code where the first original is unchanged and then the same is appended to the end of the file wrapped with the import tag
ORIGINAL CODE:
<offers>
<offer>
<leadtime-to-ship>1</leadtime-to-ship>
<product-id-type>EAN</product-id-type>
<state>11</state>
<quantity>0</quantity>
<price>146</price>
<sku>B01.001.1.10</sku>
</offer>
</offer>
AFTER CODE:
<offers>
<offer>
<leadtime-to-ship>1</leadtime-to-ship>
<product-id-type>EAN</product-id-type>
<state>11</state>
<quantity>0</quantity>
<price>146</price>
<sku>B01.001.1.10</sku>
</offer>
</offer>
<import><offers>
<offer>
<leadtime-to-ship>1</leadtime-to-ship>
<product-id-type>EAN</product-id-type>
<state>11</state>
<quantity>0</quantity>
<price>146</price>
<sku>B01.001.1.10</sku>
</offer>
</offer></import>
The issue is that you're appending the new text (the new XML) to the end of the file. You read the entire file, which leaves the file position at the end, and then write the modified XML at that position.
There are two solutions:
Recommended: open the file for reading. Read the XML. Close it, and then open it for writing and write the entire thing (overwriting the initial content).
Not Recommended: After you read, seek to the beginning of the file (with f.seek(0)) and write the new content. This solution is not recommended because if, at some point, the new content is shorter than the original content, the result will be inconsistent / messed-up.
I have a code that in principle is to open the file content and wrap it with an additional import tag
Your current approach is wrong. Don't open XML files as text files, don't treat XML as text. Always use a parser.
This is a lot better:
import xml.etree.ElementTree as ET

# 1: load current document and top level element
old_tree = ET.parse('oferta-empik.xml')
old_root = old_tree.getroot()
# 2: create <import> element to serve as new top level
new_root = ET.Element('import')
# 3: insert current document root ("wrap it in <import>")
new_root.insert(0, old_root)
# 4: make new ElementTree and write it to file
new_tree = ET.ElementTree(new_root)
with open('output.xml', 'wb') as f:
    # Use the registered name "UTF-8" and request the declaration explicitly;
    # encoding='utf8' would put the non-standard name "utf8" into the header.
    new_tree.write(f, encoding='UTF-8', xml_declaration=True)
Compressed:
new_root = ET.Element('import')
new_root.insert(0, ET.parse('oferta-empik.xml').getroot())
with open('output.xml', 'wb') as f:
    # "UTF-8" (not "utf8") so the XML declaration carries the standard encoding name.
    ET.ElementTree(new_root).write(f, encoding='UTF-8', xml_declaration=True)

lxml.etree: Start tag expected, '<' not found, line 1, column 1

I want to take some simple xml files and convert them all to CSV in one go (though this code is just for one at a time). It looks to me like there are no official name spaces, but I'm not sure.
I have this code (I used one header, SubmittingSystemVendor, but I really want to write all of them to CSV):
import csv
import lxml.etree
x = r'C:\Users\...\jh944.xml'
with open('output.csv', 'w') as f:
writer = csv.writer(f)
writer.writerow('SubmittingSystemVendor')
root = lxml.etree.fromstring(x)
writer.writerow(row)
Here is a sample of the XML file:
<?xml version="1.0" encoding="utf-8"?>
<EOYGeneralCollectionGroup SchemaVersionMajor="2014-2015" SchemaVersionMinor="1" CollectionId="157" SubmittingSystemName="MISTAR" SubmittingSystemVendor="WayneRESA" SubmittingSystemVersion="2014" xsi:noNamespaceSchemaLocation="http://cepi.state.mi.us/msdsxml/EOYGeneralCollection2014-20151.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<EOYGeneralCollection>
<SubmittingEntity>
<SubmittingEntityTypeCode>D</SubmittingEntityTypeCode>
<SubmittingEntityCode>82730</SubmittingEntityCode>
</SubmittingEntity>
The error is:
lxml.etree: Start tag expected, '<' not found, line 1, column 1
You are using lxml.etree.fromstring, but giving it a file path as the argument. This means it's trying to interpret "C:\Users...\jh944.xml" as the XML data to be parsed.
Instead, you want to open the file containing this XML. You can simply replace the call to fromstring with lxml.etree.parse, which will accept a filename or open file object as the argument.

Writing an XML header with LXML

I'm currently writing a script to convert a bunch of XML files from various encodings to a unified UTF-8.
I first try determining the encoding using LXML:
def get_source_encoding(self):
    """Return the lower-cased encoding lxml reports for ``self.inputfile``.

    The input file is rewound afterwards; an empty string is returned when
    the parsed document reports no encoding.
    """
    declared = etree.parse(self.inputfile).docinfo.encoding
    self.inputfile.seek(0)
    if not declared:
        return ''
    return declared.lower()
If that's blank, I try getting it from chardet:
def guess_source_encoding(self):
    """Guess the encoding of ``self.inputfile`` from its first 10 KiB.

    The file is rewound afterwards.  Returns the lower-cased encoding name,
    or an empty string when chardet cannot determine one.
    """
    chunk = self.inputfile.read(1024 * 10)
    self.inputfile.seek(0)
    # chardet.detect() returns a dict; the guess is under 'encoding' and may be None.
    return (chardet.detect(chunk)['encoding'] or '').lower()
I then use codecs to convert the encoding of the file:
def convert_encoding(self, source_encoding, input_filename, output_filename):
    """Copy *input_filename* to *output_filename*, re-encoding it as UTF-8.

    The input is decoded with *source_encoding* and copied in chunks so the
    whole file never has to be held in memory.
    """
    chunk_size = 16 * 1024
    with codecs.open(input_filename, "rb", source_encoding) as source, \
            codecs.open(output_filename, "wb", "utf-8") as destination:
        # read() yields an empty string once the input is exhausted
        for chunk in iter(lambda: source.read(chunk_size), ""):
            destination.write(chunk)
Finally, I'm attempting to rewrite the XML header. If the XML header was originally
<?xml version="1.0"?>
or
<?xml version="1.0" encoding="windows-1255"?>
I'd like to transform it to
<?xml version="1.0" encoding="UTF-8"?>
My current code doesn't seem to work:
def edit_header(self, input_filename):
output_filename = tempfile.mktemp(suffix=".xml")
with open(input_filename, "rb") as source:
parser = etree.XMLParser(encoding="UTF-8")
tree = etree.parse(source, parser)
with open(output_filename, "wb") as destination:
tree.write(destination, encoding="UTF-8")
The file I'm currently testing has a header that doesn't specify the encoding. How can I make it output the header properly with the encoding specified?
Try:
tree.write(destination, xml_declaration=True, encoding='UTF-8')
From the API docs:
xml_declaration controls if an XML declaration should be added to the file. Use False for never, True for always, None for only if not US-ASCII or UTF-8 (default is None).
Sample from ipython:
In [15]: etree.ElementTree(etree.XML('<hi/>')).write(sys.stdout, xml_declaration=True, encoding='UTF-8')
<?xml version='1.0' encoding='UTF-8'?>
<hi/>
On reflection, I think you are trying way too hard. lxml automatically detects the encoding and correctly parses the file according to that encoding.
So all you really have to do (at least in Python2.7) is:
def convert_encoding(self, source_encoding, input_filename, output_filename):
    """Re-serialise *input_filename* as UTF-8 with an XML declaration.

    lxml works out the source encoding itself while parsing, so
    *source_encoding* is accepted only to keep the signature and is not used.
    """
    tree = etree.parse(input_filename)
    # The serialiser emits encoded bytes, so the target must be opened in
    # binary mode; a text-mode file rejects bytes on Python 3.
    with open(output_filename, 'wb') as destination:
        tree.write(destination, encoding='utf-8', xml_declaration=True)

lxml not performing xslt transform

With this code:
from lxml import etree
with open( 'C:\\Python33\\projects\\xslt', 'r' ) as xslt, open( 'C:\\Python33\\projects\\result', 'a+' ) as result, open( 'C:\\Python33\\projects\\xml', 'r' ) as xml:
s_xml = xml.read()
s_xslt = xslt.read()
transform = etree.XSLT(etree.XML(s_xslt))
out = transform(etree.XML(s_xml))
result.write(out)
I get this error:
Traceback (most recent call last):
File "<pyshell#7>", line 1, in <module>
from projects.xslt_transform import trans
File ".\projects\xslt_transform.py", line 17, in <module>
transform = etree.XSLT(etree.XML(s_xslt))
File "xslt.pxi", line 409, in lxml.etree.XSLT.__init__ (src\lxml\lxml.etree.c:150256)
lxml.etree.XSLTParseError: Invalid expression
this couple xml/xslt files works with other tools.
Also I had to get rid of the encoding attribute in the top declarations for both files in order not to get:
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
can it be related ?
EDIT:
this does not work either (i get the same error):
with open( 'C:\\Python33\\projects\\xslt', 'r',encoding="utf-8" ) as xslt, open( 'C:\\Python33\\projects\\result', 'a+',encoding="utf-8" ) as result, open( 'C:\\Python33\\projects\\xml', 'r',encoding="utf-8" ) as xml:
s_xml = etree.parse(BytesIO(bytes(xml.read(),'UTF-8')))
s_xslt = etree.parse(BytesIO(bytes(xslt.read(),'UTF-8')))
transform = etree.XSLT(s_xslt)
out = transform(s_xml)
print(out.tostring())
reading lxml source code: this returns an exception:
xslt.xsltParseStylesheetDoc(c_doc)
so it seems an actual parse error. Can it be namespace related ?
EDIT SOLVED:
s_xml = etree.parse(xml.read())
s_xslt = etree.parse(xslt.read())
thanks tomalak
Parsing XML is more complicated than "open a text file, stuff the resulting string into etree".
XML files are serialized representations of a DOM tree. They are not to be handled as text even though they come in the shape of a text file. They come in multiple byte encodings and finding out which encoding a certain file uses is anything but trivial.
XML parsers have proper detection mechanisms built in and therefore they should be used to open XML files. The basic open() + read() calls are not enough to correctly handle the file contents.
lxml.etree provides the parse() function that can accept a number of argument types:
an open file object (make sure to open it in binary mode)
a file-like object that has a .read(byte_count) method returning a byte string on each call
a filename string
an HTTP or FTP URL string
and then will correctly parse the associated document back into a DOM tree.
Your code should look more like this:
from lxml import etree
f_xsl = 'C:\\Python33\\projects\\xslt'
f_xml = 'C:\\Python33\\projects\\xml'
f_out = 'C:\\Python33\\projects\\result'
transform = etree.XSLT(etree.parse(f_xsl))
result = transform(etree.parse(f_xml))
result.write(f_out)

How to write XML declaration using xml.etree.ElementTree

I am generating an XML document in Python using an ElementTree, but the tostring function doesn't include an XML declaration when converting to plaintext.
from xml.etree.ElementTree import Element, tostring
document = Element('outer')
node = SubElement(document, 'inner')
node.NewValue = 1
print tostring(document) # Outputs "<outer><inner /></outer>"
I need my string to include the following XML declaration:
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
However, there does not seem to be any documented way of doing this.
Is there a proper method for rendering the XML declaration in an ElementTree?
I am surprised to find that there doesn't seem to be a way with ElementTree.tostring(). You can however use ElementTree.ElementTree.write() to write your XML document to a fake file:
from io import BytesIO
from xml.etree import ElementTree as ET

# Build <outer><inner /></outer>
document = ET.Element('outer')
node = ET.SubElement(document, 'inner')

# Write the tree into an in-memory "file" so write() can add the declaration.
buffer = BytesIO()
ET.ElementTree(document).write(buffer, xml_declaration=True, encoding='utf-8')
xml_bytes = buffer.getvalue()
print(xml_bytes)  # your XML file, encoded as UTF-8
See this question. Even then, I don't think you can get your 'standalone' attribute without prepending it yourself.
I would use lxml (see http://lxml.de/api.html).
Then you can:
from lxml import etree
document = etree.Element('outer')
node = etree.SubElement(document, 'inner')
print(etree.tostring(document, xml_declaration=True))
If you include the encoding='utf8', you will get an XML header:
xml.etree.ElementTree.tostring writes a XML encoding declaration with encoding='utf8'
Sample Python code (works with Python 2 and 3):
import xml.etree.ElementTree as ElementTree
tree = ElementTree.ElementTree(
ElementTree.fromstring('<xml><test>123</test></xml>')
)
root = tree.getroot()
print('without:')
print(ElementTree.tostring(root, method='xml'))
print('')
print('with:')
print(ElementTree.tostring(root, encoding='utf8', method='xml'))
Python 2 output:
$ python2 example.py
without:
<xml><test>123</test></xml>
with:
<?xml version='1.0' encoding='utf8'?>
<xml><test>123</test></xml>
With Python 3 you will note the b prefix indicating byte literals are returned (just like with Python 2):
$ python3 example.py
without:
b'<xml><test>123</test></xml>'
with:
b"<?xml version='1.0' encoding='utf8'?>\n<xml><test>123</test></xml>"
xml_declaration Argument
Is there a proper method for rendering the XML declaration in an ElementTree?
YES, and there is no need of using .tostring function. According to ElementTree Documentation, you should create an ElementTree object, create Element and SubElements, set the tree's root, and finally use xml_declaration argument in .write function, so the declaration line is included in output file.
You can do it this way:
import xml.etree.ElementTree as ET

# Build the document first ...
document = ET.Element("outer")
node1 = ET.SubElement(document, "inner")
node1.text = "text"
# ... then wrap the finished root in a tree.  Passing the root to the
# constructor avoids a placeholder string as root element and the private
# _setroot() call.
tree = ET.ElementTree(document)
tree.write("./output.xml", encoding="UTF-8", xml_declaration=True)
And the output file is:
<?xml version='1.0' encoding='UTF-8'?>
<outer><inner>text</inner></outer>
I encountered this issue recently. After some digging through the code, I found that the following snippet is the definition of the function ElementTree.write:
def write(self, file, encoding="us-ascii"):
assert self._root is not None
if not hasattr(file, "write"):
file = open(file, "wb")
if not encoding:
encoding = "us-ascii"
elif encoding != "utf-8" and encoding != "us-ascii":
file.write("<?xml version='1.0' encoding='%s'?>\n" %
encoding)
self._write(file, self._root, encoding, {})
So the answer is: if you need to write the XML header to your file, set the encoding argument to something other than utf-8 or us-ascii, e.g. UTF-8 (the comparison in this code is case-sensitive).
Easy
Sample for both Python 2 and 3 (encoding parameter must be utf8):
import xml.etree.ElementTree as ElementTree
tree = ElementTree.ElementTree(ElementTree.fromstring('<xml><test>123</test></xml>'))
root = tree.getroot()
print(ElementTree.tostring(root, encoding='utf8', method='xml'))
From Python 3.8 there is xml_declaration parameter for that stuff:
New in version 3.8: The xml_declaration and default_namespace
parameters.
xml.etree.ElementTree.tostring(element, encoding="us-ascii",
method="xml", *, xml_declaration=None, default_namespace=None,
short_empty_elements=True) Generates a string representation of an XML
element, including all subelements. element is an Element instance.
encoding is the output encoding (default is US-ASCII). Use
encoding="unicode" to generate a Unicode string (otherwise, a
bytestring is generated). method is either "xml", "html" or "text"
(default is "xml"). xml_declaration, default_namespace and
short_empty_elements has the same meaning as in ElementTree.write().
Returns an (optionally) encoded string containing the XML data.
Sample for Python 3.8 and higher:
import xml.etree.ElementTree as ElementTree

tree = ElementTree.ElementTree(ElementTree.fromstring('<xml><test>123</test></xml>'))
root = tree.getroot()
# Serialise with a real encoding and decode afterwards: with encoding='unicode'
# some Python versions put the locale's preferred encoding (e.g. cp1252) into
# the XML declaration instead of UTF-8.
xml_text = ElementTree.tostring(
    root, encoding='UTF-8', method='xml', xml_declaration=True
).decode('UTF-8')
print(xml_text)
The minimal working example with ElementTree package usage:
import xml.etree.ElementTree as ET
document = ET.Element('outer')
node = ET.SubElement(document, 'inner')
node.text = '1'
res = ET.tostring(document, encoding='utf8', method='xml').decode()
print(res)
the output is:
<?xml version='1.0' encoding='utf8'?>
<outer><inner>1</inner></outer>
Another pretty simple option is to concatenate the desired header to the string of xml like this:
xml = (bytes('<?xml version="1.0" encoding="UTF-8"?>\n', encoding='utf-8') + ET.tostring(root))
xml = xml.decode('utf-8')
with open('invoice.xml', 'w+') as f:
f.write(xml)
I would use ET:
try:
from lxml import etree
print("running with lxml.etree")
except ImportError:
try:
# Python 2.5
import xml.etree.cElementTree as etree
print("running with cElementTree on Python 2.5+")
except ImportError:
try:
# Python 2.5
import xml.etree.ElementTree as etree
print("running with ElementTree on Python 2.5+")
except ImportError:
try:
# normal cElementTree install
import cElementTree as etree
print("running with cElementTree")
except ImportError:
try:
# normal ElementTree install
import elementtree.ElementTree as etree
print("running with ElementTree")
except ImportError:
print("Failed to import ElementTree from any known place")
document = etree.Element('outer')
node = etree.SubElement(document, 'inner')
print(etree.tostring(document, encoding='UTF-8', xml_declaration=True))
This works if you just want to print. Getting an error when I try to send it to a file...
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
def prettify(elem):
    """Return *elem* serialised as pretty-printed XML text.

    The element is serialised with ElementTree, re-parsed with minidom and
    rendered through ``toprettyxml``.
    """
    serialized = ET.tostring(elem, 'utf-8')
    return minidom.parseString(serialized).toprettyxml(indent=" ")
Including 'standalone' in the declaration
I didn't find any alternative for adding the standalone argument in the documentation, so I adapted the ET.tostring function to take the declaration as an argument.
from xml.etree import ElementTree as ET
# Sample
document = ET.Element('outer')
node = ET.SubElement(document, 'inner')
et = ET.ElementTree(document)
# Function that you need
def tostring(element, declaration, encoding=None, method=None,):
    """Serialise *element* as text, preceded by a caller-supplied declaration.

    *declaration* is written verbatim on the first line, which allows
    attributes such as ``standalone`` that ElementTree cannot produce itself.
    *encoding* and *method* are passed on to ``ElementTree.write()``.

    Returns a ``str`` consisting of the declaration, a newline and the
    serialised element.
    """
    import io

    text_mode = encoding is not None and encoding.lower() == "unicode"
    stream = io.StringIO() if text_mode else io.BytesIO()
    # xml_declaration=False: the caller's declaration is the only one wanted,
    # even for encodings where ElementTree would add its own.
    ET.ElementTree(element).write(stream, encoding,
                                  xml_declaration=False, method=method)
    body = stream.getvalue()
    if not text_mode:
        # write() produced bytes; decode with the encoding it used
        # (it falls back to utf-8 for c14n and us-ascii otherwise).
        fallback = "utf-8" if method == "c14n" else "us-ascii"
        body = body.decode(encoding or fallback)
    return declaration + "\n" + body
# Working example
xdec = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?>"""
xml = tostring(document, encoding='utf-8', declaration=xdec)

Categories

Resources