lxml create CDATA element - python

I am trying to create CDATA element as per https://lxml.de/apidoc/lxml.etree.html#lxml.etree.CDATA
The simplified version of my code looks like this:
description = ET.SubElement(item, "description")
description.text = CDATA('test')
But when I later try to convert it to string:
xml_str = ET.tostring(self.__root, xml_declaration=True).decode()
I get an exception
cannot serialize <lxml.etree.CDATA object at 0x122c30ef0> (type CDATA)
Could you advise me on what I am missing?
Here is a simple example:
import xml.etree.cElementTree as ET
from lxml.etree import CDATA
root = ET.Element('rss')
root.set("version", "2.0")
description = ET.SubElement(root, "description")
description.text = CDATA('test')
xml_str = ET.tostring(root, xml_declaration=True).decode()
print(xml_str)

lxml.etree and xml.etree are two different libraries; you should pick one and stick with it, rather than using both and trying to pass objects created by one to the other.
A working example, using lxml only:
import lxml.etree as ET
from lxml.etree import CDATA
root = ET.Element('rss')
root.set("version", "2.0")
description = ET.SubElement(root, "description")
description.text = CDATA('test')
xml_str = ET.tostring(root, xml_declaration=True).decode()
print(xml_str)
You can run this yourself at https://replit.com/#CharlesDuffy2/JovialMediumLeadership

Related

Updating xml tag in python using lxml Etree

I'm trying to update a single tag in my xml file with another value. I'm using lxml module in python.
bplocation = os.getcwd()+"/apiproxy/proxies";
tree = lxml.etree.parse(bplocation+'/default.xml');
root = tree.getroot();
update = lxml.etree.SubElement(root, "BasePath");
update.text = "new basepath";
root.SubElement('BasePath',update);
pretty = lxml.etree.tostring(root, encoding="unicode", pretty_print=True);
f = open("test.xml", "w")
f.write(pretty)
f.close()
I'm getting AttributeError: 'lxml.etree._Element' object has no attribute 'SubElement' error.
I just need the tag updated in xml.
Below is the xml.
<ProxyEndpoint name="default">
<HTTPProxyConnection>
<BasePath>/v2/test</BasePath>
<VirtualHost>https_vhost_sslrouter</VirtualHost>
<VirtualHost>secure</VirtualHost>
</HTTPProxyConnection>
</ProxyEndpoint>
SubElement() (a function in the lxml.etree module) creates a new element, but that is not necessary.
Just get a reference to the existing <BasePath> element and update its text content.
from lxml import etree
tree = etree.parse("default.xml")
update = tree.find("//BasePath")
update.text = "new basepath"
pretty = etree.tostring(tree, encoding="unicode", pretty_print=True)
print(pretty)
Output:
<ProxyEndpoint name="default">
<HTTPProxyConnection>
<BasePath>new basepath</BasePath>
<VirtualHost>https_vhost_sslrouter</VirtualHost>
<VirtualHost>secure</VirtualHost>
</HTTPProxyConnection>
</ProxyEndpoint>

Parsing XML Attributes with Python

I am trying to parse out all the green-highlighted attributes (some sensitive things have been blacked out). I have a bunch of XML files, all with similar formats. I already know how to loop through each of them individually, but I am having trouble parsing out the specific attributes.
XML Document
I need the text in the attributes: name="text1"
from
project logLevel="verbose" version="2.0" mainModule="Main" name="text1">
destinationDir="/text2" from
put label="Put Files" destinationDir="/Trigger/FPDMMT_INBOUND">
destDir="/text3" from
copy disabled="false" version="1.0" label="Archive Files" destDir="/text3" suffix="">
I am using
import csv
import os
import re
import xml.etree.ElementTree as ET
tree = ET.parse(XMLfile_path)
item = tree.getroot()[0]
root = tree.getroot()
print (item.get("name"))
print (root.get("name"))
This outputs:
Main
text1
item.get reads from the element at index [0], i.e. the first child of the root element, which is <module
root.get reads from the root element itself, which is <project
I know there's a way to search for exactly the right part of the root/tree with something like:
test = root.find('./project/module/ftp/put')
print (test.get("destinationDir"))
I need to be able to jump directly to the thing I need and output the attributes I need.
Any help would be appreciated
Thanks.
Simplified copy of your XML:
xml = '''<project logLevel="verbose" version="2.0" mainModule="Main" name="hidden">
<module name="Main">
<createWorkspace version="1.0"/>
<ftp version="1.0" label="FTP connection to PRD">
<put label="Put Files" destinationDir="destination1">
</put>
</ftp>
<ftp version="1.0" label="FTP connection to PRD">
<put label="Put Files" destinationDir="destination2">
</put>
</ftp>
<copy disabled="false" destDir="destination3">
</copy>
</module>
</project>
'''
# solution using ETree
from xml.etree import ElementTree as ET
root = ET.fromstring(xml)
name = root.get('name')
ftp_destination_dir1 = root.findall('./module/ftp/put')[0].get('destinationDir')
ftp_destination_dir2 = root.findall('./module/ftp/put')[1].get('destinationDir')
copy_destination_dir = root.find('./module/copy').get('destDir')
print(name)
print(ftp_destination_dir1)
print(ftp_destination_dir2)
print(copy_destination_dir)
# solution using lxml
from lxml import etree as et
root = et.fromstring(xml)
name = root.get('name')
ftp_destination_dirs = root.xpath('./module/ftp/put/@destinationDir')
copy_destination_dir = root.xpath('./module/copy/@destDir')[0]
print(name)
print(ftp_destination_dirs[0])
print(ftp_destination_dirs[1])
print(copy_destination_dir)

Xpath in Python . Getting SyntaxError("invalid predicate")

import xml.etree.ElementTree as ET
tree = ET.parse('test.xml')
xpathobjects = tree.findall(".//BuildingNodeBase[name = 'Building name']")
I want to select a BuildingNodeBase element that has a child tag name whose value is Building name.
But Getting:
SyntaxError("invalid predicate")
The XPath support in ElementTree is limited, but your type of expression is supported. It's just that you need to remove the extra spaces around the =:
.//BuildingNodeBase[name='Building name']
I use lxml but I guess you can adopt this for your use:
from lxml import etree
tree = etree.parse('test.xml')
xpathobjects = tree.xpath(".//BuildingNodeBase[@name = 'Building name']")

Restore CDATA during lxml serialization

I know that I can preserve CDATA sections during XML parsing, using the following:
from lxml import etree
parser = etree.XMLParser(strip_cdata=False)
root = etree.XML('<root><![CDATA[test]]></root>', parser)
See APIs specific to lxml.etree
But, is there a simple way to "restore" CDATA section during serialization?
For example, by specifying a list of tag names…
For instance, I want to turn:
<CONFIG>
<BODY>This is a &lt;message&gt;.</BODY>
</CONFIG>
to:
<CONFIG>
<BODY><![CDATA[This is a <message>.]]></BODY>
</CONFIG>
Just by specifying that BODY should contain CDATA…
Something like this?
from lxml import etree
parser = etree.XMLParser(strip_cdata=True)
root = etree.XML('<root><x><![CDATA[<test>]]></x></root>', parser)
print etree.tostring(root)
for elem in root.findall('x'):
elem.text = etree.CDATA(elem.text)
print etree.tostring(root)
Produces:
<root><x>&lt;test&gt;</x></root>
<root><x><![CDATA[<test>]]></x></root>

Parsing XML using Python minidom

<PacketHeader>
<HeaderField>
<name>number</name>
<dataType>int</dataType>
</HeaderField>
</PacketHeader>
This is my small XML file and I want to extract out the text which is within the name tag.
Here is my code snippet:-
from xml.dom import minidom
from xml.dom.minidom import parse
xmldoc = minidom.parse('sample.xml')
packetHeader = xmldoc.getElementsByTagName("PacketHeader")
headerField = packetHeader.getElementsByTagName("HeaderField")
for field in headerField:
getFieldName = field.getElementsByTagName("name")
print getFieldName
However, this prints the location of the node objects rather than the text.
from xml.dom import minidom
from xml.dom.minidom import parse
xmldoc = minidom.parse('sample.xml')
# find the name element, if found return a list, get the first element
name_element = xmldoc.getElementsByTagName("name")[0]
# this will be a text node that contains the actual text
text_node = name_element.childNodes[0]
# get text
print text_node.data
Please check this.
Update
By the way, I suggest using ElementTree. Below is a code snippet using ElementTree that does the same thing as the minidom code above:
import elementtree.ElementTree as ET
tree = ET.parse("sample.xml")
# the tree root is the toplevel `PacketHeader` element
print tree.findtext("HeaderField/name")
A small variant of the accepted and correct answer above is:
from xml.dom import minidom
xmldoc = minidom.parse('fichier.xml')
name_element = xmldoc.getElementsByTagName('name')[0]
print name_element.childNodes[0].nodeValue
This simply uses nodeValue instead of its alias data

Categories

Resources