python xml pretty print not working - python

I am modifying some XML by adding nodes and values from a list. I can successfully create all the new tags and values between the contributors tags, but when I save the XML out to a new file, the tags I created are all on one line. Here is a sample of my code:
templateXml = """<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<package>
<delivery_type>new</delivery_type>
<feature>
<feature_type>Movie</feature_type>
<contributors>
</contributors>
</package>"""
from lxml import etree
# Parse the template and locate the <contributors> container to fill in.
tree = etree.fromstring(templateXml)
node_video = tree.xpath('//feature/contributors')[0]
# Append one <contributor type="cast"> element, with <name> and <role>
# children, for every entry in castList.
for cast in castList:
    pageElement = etree.SubElement(node_video, 'contributor')
    pageElement.attrib['type'] = 'cast'
    pageElement1 = etree.SubElement(pageElement, 'name')
    pageElement1.text = cast.text
    pageElement2 = etree.SubElement(pageElement, 'role')
    pageElement2.text = "actor"
xmlFileOut = '/Users/User1/Desktop/Python/Done.xml'
# etree.tostring() with an explicit encoding yields encoded bytes, so the
# output file is opened in binary mode.
with open(xmlFileOut, "wb") as f:
    f.write(etree.tostring(tree, pretty_print = True, xml_declaration = True, encoding='UTF-8', standalone="yes"))
Here is saved xml file:
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<package>
<delivery_type>new</delivery_type>
<feature>
<feature_type>Movie</feature_type>
<contributors>
<contributor type="cast"><name>John Doe</name><role>actor</role></contributor><contributor type="cast"><name>Another Actors name</name><role>actor</role></contributor><contributor type="cast"><name>Jane Doe</name><role>actor</role></contributor><contributor type="cast"><name>John Smith</name><role>actor</role></contributor></contributors>
</package>
I have solved this issue when opening an xml file to work on using the below code:
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True) # makes pretty print work
path3 = 'path_to_xml_file'
# etree.parse() opens the path itself; no separate open() call is needed
# (the previous bare open() left an unused file handle behind).
tree = etree.parse(path3, parser)
root = tree.getroot()
tree.write(xmlFileOut, pretty_print = True, xml_declaration = True, encoding = 'UTF-8')
This works, but how do I get it to work with a string xml?

Taken from http://ruslanspivak.com/2014/05/12/how-to-pretty-print-xml-with-lxml/
import StringIO
import lxml.etree as etree
def prettify(xml_text):
    """Pretty prints xml.

    Parses *xml_text* with a parser created with ``remove_blank_text=True``
    and serializes the resulting document again with ``pretty_print=True``.

    Args:
        xml_text: The XML document as a string. On Python 3, pass bytes when
            the text begins with an XML declaration that names an encoding,
            since lxml refuses such input as ``str``.

    Returns:
        The re-serialized document as produced by ``etree.tostring``.
    """
    parser = etree.XMLParser(remove_blank_text=True)
    # Parse directly from the in-memory string; no file-like wrapper is
    # required, so this no longer depends on the Python 2-only StringIO module.
    root = etree.fromstring(xml_text, parser)
    # Serialize the owning tree (not just the root element), as before.
    return etree.tostring(root.getroottree(), pretty_print=True)

A simple solution might be to use StringIO:
from StringIO import StringIO
from lxml import etree
# Parser configured with remove_blank_text=True (the option that makes
# pretty printing work in the file-based example).
parser = etree.XMLParser(remove_blank_text=True)
# Wrap the XML string in a file-like object so it can be handed to etree.parse().
tree = etree.parse(StringIO(templateXml), parser)

Related

Python - multiple logs/structures in the same xml file using ElementTree

I wrote an xml file with ElementTree. The problem i'm facing is that I want to write multiple logs in the same xml file.
The code:
import xml.etree.cElementTree as ET
# XML: build a single <LOG> element with four text-bearing children.
root = ET.Element('LOG')
DATE = ET.SubElement(root, 'DATE')
DATE.text = "child_1"
TIME = ET.SubElement(root, 'TIME')
TIME.text = "child_2"
CC = ET.SubElement(root, 'CC')
CC.text = "child_3"
AMOUNT = ET.SubElement(root, 'AMOUNT')
AMOUNT.text = "child_4"
tree = ET.ElementTree(root)
# Generating XML
tree.write("file_name.xlm")
# Print: read the file back through a context manager so the handle is closed.
with open("file_name.xlm") as written_file:
    print(written_file.read())
Current output example:
<LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG>
Output that I want:
<LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG><LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG><LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG>...
What you want is just not how xml works. xml documents have one root node, and just one. The closest you can get is by wrapping your <LOG> entries in a container:
<ALL_LOGS>
<LOG>....</LOG>
<LOG>....</LOG>
</ALL_LOGS>

adding elements in xml using python

How do I add an element with attributes?
i know how to add with text:
ET.subElement(root[2],'resource')
for x in root.iter('resource'):
but what i'm looking for is:
<resources>
<resource identifier="presentation_3" type="webcontent" href="questions.html" adlcp:scormtype="sco">
<file href="questions.html"/>
</resource>
</resources>
below is my code:
import xml.etree.ElementTree as ET
# Load the manifest and take its root element.
xmlfile = "imsmanifest.xml"
tree = ET.parse(xmlfile)
root = tree.getroot()
# Append a <file> child carrying href="index.html" to the root's third child.
ET.SubElement(root[2], 'file', {'href': 'index.html'})
# Save the modified tree under a new name.
tree.write('new.xml')

How to extract values from xml file with namespaces?

I have the xml file shown below, that has namespaces, for which I'm trying to extract the values of Node24
My current code is below, that is not printing anything:
import xml.etree.ElementTree as ET
filename = 'ifile.xml'
tree = ET.parse(filename)
root = tree.getroot()
# NOTE: 'Node24' is looked up here without any namespace qualification.
for neighbor in root.iter('Node24'):
    print(neighbor)
My expected output would be:
03-c34ko
04-c64ko
07-c54ko
This is the ifile.xml:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<data-main-43:DATAMAINXZ123 xmlns="https://example.com/DATA-MAIN-XZ123" xmlns:data-gen="https://example.com/DATA-GEN" xmlns:data-main-43="https://example.com/DATA-MAIN-XZ123" xmlns:xsi="http://www.w3.org/2011/XMLSchema-instance" xsi:schemaLocation="https://example.com/DATA-MAIN-XZ123 data-main-ir21-12.1.xsd">
<MAINXZ123FileHeader>
<DATAGenSchemaVersion>2.4</DATAGenSchemaVersion>
<DATAMAINXZ123SchemaVersion>12.1</DATAMAINXZ123SchemaVersion>
</MAINXZ123FileHeader>
<Node1>
<Node2>WTRT DDK</Node2>
<Node3>XYZW</Node3>
<Node4>
<Node5>
<Node6>XYZW882</Node6>
<Node5Type>Ter</Node5Type>
<Node5Data>
<Node9>
<Node10>
<Node11>2019-02-18</Node11>
<Node12>
<Node13>
<Node14>
<Node15>Ermso</Node15>
<Node16>
<PrimaryNode16>
<Node18>19.32</Node18>
<Node18>12.11</Node18>
</PrimaryNode16>
<SecondaryNode16>
<Node18>82.97</Node18>
<Node18>12.41</Node18>
</SecondaryNode16>
</Node16>
<Node20>Muuatippw</Node20>
</Node14>
</Node13>
</Node12>
<Node21>
<Node22>
<Node23>
<Node24>03-c34ko</Node24>
<Node24>04-c64ko</Node24>
<Node24>07-c54ko</Node24>
</Node23>
<Node26Node22EdgeAgent>
<Node26>jjkksonem</Node26>
<PrimaryNode18DEANode26>
<Node18>2.40</Node18>
</PrimaryNode18DEANode26>
</Node26Node22EdgeAgent>
</Node22>
</Node21>
<Node28>
<Node29>
<Node30>false</Node30>
<Node31>true</Node31>
</Node29>
</Node28>
</Node10>
</Node9>
</Node5Data>
</Node5>
</Node4>
</Node1>
</data-main-43:DATAMAINXZ123>
How can I do this? Thanks in advance.
Like the duplicate mzjn referenced, just add the namespace uri to the element name...
import xml.etree.ElementTree as ET
filename = 'ifile.xml'
tree = ET.parse(filename)
root = tree.getroot()
# The tag is written as '{namespace-uri}localname' so that it matches
# elements living in the document's default namespace.
for neighbor in root.iter('{https://example.com/DATA-MAIN-XZ123}Node24'):
    print(neighbor.text)
Note: I also added .text to neighbor so you'd get the requested result.
I'm using regular expression so this is an alternative answer.
I converted the xml into string then search for all strings between Node24
import xml.etree.ElementTree as ET
import re
filename = 'ifile.xml'
tree = ET.parse(filename)
root = tree.getroot()
xml_str = ET.tostring(root)
# Decode the serialized bytes to text before searching. Calling str() on a
# bytes object would search its escaped repr (b'...') rather than the XML.
for s in re.findall(r'ns0:Node24>(.*?)</ns0:Node24', xml_str.decode('ascii')):
    print(s)
Result:
03-c34ko
04-c64ko
07-c54ko

case insensitive xml and python

I got this piece of code and I am trying to read all the 'ref' 'href' tags. I am not sure how to make this to be case insensitive as some of my xml files have REF or Ref or ref.
Any suggestions?
f = urllib.urlopen(url)
tree = ET.parse(f)
root = tree.getroot()
# Collect the href of every <ref> element, skipping values already stored.
# The tag comparison done by iter('ref') is case-sensitive.
for child in root.iter('ref'):
    t = child.get('href')
    if t not in self.href:
        self.href.append(t)
        print(self.href[-1])
You can normalize tags and attributes by converting them to lowercase using the functions below as a step of preprocessing:
import xml.etree.ElementTree as ET
f = urllib.urlopen(url)
tree = ET.parse(f)
root = tree.getroot()
def normalize_tags(root):
    """Lower-case the tag of *root* and of every descendant, in place.

    Args:
        root: The element whose subtree should be normalized.
    """
    root.tag = root.tag.lower()
    # Recurse into the direct children; each call handles its own subtree.
    for child in root:
        normalize_tags(child)
def normalize_attr(root):
    """Lower-case every attribute name on *root* and its descendants, in place.

    Attribute values are left untouched. If an element carries two attributes
    whose names differ only by case, the one processed later overwrites the
    lower-cased entry.

    Args:
        root: The element whose subtree should be normalized.
    """
    # Iterate over a snapshot of the items: the loop body adds and removes
    # keys of root.attrib, which is not allowed while iterating the live view.
    for attr, value in list(root.attrib.items()):
        norm_attr = attr.lower()
        if norm_attr != attr:
            root.set(norm_attr, value)
            root.attrib.pop(attr)
    for child in root:
        normalize_attr(child)
normalize_tags(root)
normalize_attr(root)
print(ET.tostring(root))
The following should help
f = urllib.urlopen(url)
tree = ET.parse(f)
root = tree.getroot()
# Visit every element in the tree and compare tag names case-insensitively,
# so that REF, Ref and ref are all accepted.
for child in root.iter():
    if child.tag.lower() == 'ref':
        # Element exposes attributes through get()/attrib; it has no
        # 'attribute' member.
        t = child.get('href')
        if t not in self.href:
            self.href.append(t)
            print(self.href[-1])
If you are using lxml then one option is to use XPath with regular expressions through XSLT extensions (https://stackoverflow.com/a/2756994/2997179):
# Select every descendant element whose local name is 'ref' in any letter
# case; the third argument 'i' is the EXSLT case-insensitive flag.
root.xpath(".//*[re:test(local-name(), '^ref$', 'i')]",
           namespaces={"re": "http://exslt.org/regular-expressions"})

Python: specify XMLNS on xml.etree elements

in my Python code I'm currently using the xml.etree library to create a tree and then dump it to an XML string. Unfortunately I can't use modules other than the ones in the Python Standard Libraries to do that.
Here is my code:
import xml.etree.ElementTree as ET
def dump_to_XML():
    """Build a small sample tree and return it serialized as XML.

    The tree is ``root`` with children ``child1`` (text "foo") and ``child2``,
    the latter holding ``grandchild1`` (text "bar").

    Returns:
        The result of ``ET.tostring`` for the root node, using the 'utf8'
        encoding and the 'xml' output method.
    """
    root_node = ET.Element("root")
    c1_node = ET.SubElement(root_node, "child1")
    c1_node.text = "foo"
    c2_node = ET.SubElement(root_node, "child2")
    gc1_node = ET.SubElement(c2_node, "grandchild1")
    gc1_node.text = "bar"
    return ET.tostring(root_node, encoding='utf8', method='xml')
which gives the string:
<?xml version='1.0' encoding='utf8'?>
<root>
<child1>foo</child1>
<child2>
<grandchild1>bar</grandchild1>
</child2>
</root>
Now, I have two schema files located - say - http://myhost.com/p.xsd and http://myhost.com/q.xsd, I want the output string to be turned into:
<?xml version='1.0' encoding='UTF-8'?>
<root xmlns:p="http://myhost.com/p.xsd" xmlns:q="http://myhost.com/q.xsd">
<p:child1>foo</p:child1>
<p:child2>
<q:grandchild1>bar</q:grandchild1>
</p:child2>
</root>
How can I leverage the etree library in order to achieve that?
Thanks in advance
Here we go:
import xml.etree.ElementTree as ET
xmlns_uris = {'p': 'http://myhost.com/p.xsd',
'q': 'http://myhost.com/q.xsd'}
def dump_to_XML():
    """Build the sample tree, add namespace prefixes, and return it as XML.

    ``grandchild1`` is tagged with the 'q' prefix first; the remaining
    un-prefixed descendants of the root then receive the 'p' prefix. Finally
    the ``xmlns:*`` declarations from ``xmlns_uris`` are set on the root.

    Returns:
        The result of ``ET.tostring`` for the root node, using the 'UTF-8'
        encoding and the 'xml' output method.
    """
    root_node = ET.Element("root")
    c1_node = ET.SubElement(root_node, "child1")
    c1_node.text = "foo"
    c2_node = ET.SubElement(root_node, "child2")
    gc1_node = ET.SubElement(c2_node, "grandchild1")
    gc1_node.text = "bar"
    # Order matters: elements that already contain ':' are skipped, so the
    # 'q' prefix must be applied before the blanket 'p' pass.
    annotate_with_XMLNS_prefixes(gc1_node, 'q', False)
    annotate_with_XMLNS_prefixes(root_node, 'p')
    add_XMLNS_attributes(root_node, xmlns_uris)
    return ET.tostring(root_node, encoding='UTF-8', method='xml')
def annotate_with_XMLNS_prefixes(tree, xmlns_prefix, skip_root_node=True):
    """Prefix the tags in *tree* with ``xmlns_prefix + ":"``, in place.

    Tags that already contain a ':' are left unchanged.

    Args:
        tree: An Element, or an object exposing ``getroot()`` (such as an
            ElementTree) whose root element is used.
        xmlns_prefix: The prefix text to put in front of each tag.
        skip_root_node: When true, the first element yielded by the traversal
            (the root itself) is not renamed.
    """
    if not ET.iselement(tree):
        tree = tree.getroot()
    iterator = tree.iter()
    if skip_root_node:  # Add XMLNS prefix also to the root node?
        # The built-in next() works on both Python 2 and 3 iterators.
        next(iterator)
    for e in iterator:
        if ':' not in e.tag:
            e.tag = xmlns_prefix + ":" + e.tag
def add_XMLNS_attributes(tree, xmlns_uris_dict):
    """Set an ``xmlns:<prefix>`` attribute on *tree* for each mapping entry.

    Args:
        tree: An Element, or an object exposing ``getroot()`` (such as an
            ElementTree) whose root element is used.
        xmlns_uris_dict: Mapping of namespace prefix to namespace URI.
    """
    if not ET.iselement(tree):
        tree = tree.getroot()
    for prefix, uri in xmlns_uris_dict.items():
        tree.attrib['xmlns:' + prefix] = uri
Executing: print dump_to_XML() gives:
<?xml version='1.0' encoding='UTF-8'?>
<root xmlns:p="http://myhost.com/p.xsd" xmlns:q="http://myhost.com/q.xsd">
<p:child1>foo</p:child1>
<p:child2>
<q:grandchild1>bar</q:grandchild1>
</p:child2>
</root>
from lxml import etree
# Prefix -> namespace URI map, declared on the root element via nsmap.
xmlns_uris = {'p': 'http://myhost.com/p.xsd', 'q': 'http://myhost.com/q.xsd'}
root = etree.Element('root', nsmap = xmlns_uris)
# Child tags are given as '{namespace-uri}localname'.
child1 = etree.SubElement(root,'{%s}child1'%xmlns_uris['p'])
child1.text = 'foo'
child2 = etree.SubElement(root,'{%s}child2'%xmlns_uris['p'])
grandchild1 = etree.SubElement(child2,'{%s}grandchild1'%xmlns_uris['q'])
grandchild1.text = 'bar'
# Decode with the same codec used for serialization (UTF-8); decoding the
# UTF-8 bytes as cp1251 would garble any non-ASCII character.
print(etree.tostring(root, pretty_print=True, encoding='UTF-8', xml_declaration=True).decode('UTF-8'))

Categories

Resources