Restore CDATA during lxml serialization - python

I know that I can preserve CDATA sections during XML parsing, using the following:
from lxml import etree
parser = etree.XMLParser(strip_cdata=False)
root = etree.XML('<root><![CDATA[test]]></root>', parser)
See APIs specific to lxml.etree
But, is there a simple way to "restore" CDATA section during serialization?
For example, by specifying a list of tag names…
For instance, I want to turn:
<CONFIG>
<BODY>This is a &lt;message&gt;.</BODY>
</CONFIG>
to:
<CONFIG>
<BODY><![CDATA[This is a <message>.]]></BODY>
</CONFIG>
Just by telling that BODY should contain CDATA…

Something like this?
from lxml import etree
parser = etree.XMLParser(strip_cdata=True)
root = etree.XML('<root><x><![CDATA[<test>]]></x></root>', parser)
print etree.tostring(root)
for elem in root.findall('x'):
elem.text = etree.CDATA(elem.text)
print etree.tostring(root)
Produces:
<root><x>&lt;test&gt;</x></root>
<root><x><![CDATA[<test>]]></x></root>

Related

Parsing XML Attributes with Python

I am trying to parse out all the green-highlighted attributes (some sensitive things have been blacked out). I have a bunch of XML files, all with similar formats. I already know how to loop through each of them individually, but I am having trouble parsing out the specific attributes.
XML Document
I need the text in the attributes: name="text1"
from
project logLevel="verbose" version="2.0" mainModule="Main" name="text1">
destinationDir="/text2" from
put label="Put Files" destinationDir="/Trigger/FPDMMT_INBOUND">
destDir="/text3" from
copy disabled="false" version="1.0" label="Archive Files" destDir="/text3" suffix="">
I am using
import csv
import os
import re
import xml.etree.ElementTree as ET
tree = ET.parse(XMLfile_path)
item = tree.getroot()[0]
root = tree.getroot()
print (item.get("name"))
print (root.get("name"))
This outputs:
Main
text1
item.get reads from the element at index [0], i.e. the first child of the root in the tree, which is <module
root.get reads from the root element itself, which is the first line, <project
I know there's a way to search for exactly the right part of the root/tree with something like:
test = root.find('./project/module/ftp/put')
print (test.get("destinationDir"))
I need to be able to jump directly to the thing I need and output the attributes I need.
Any help would be appreciated
Thanks.
Simplified copy of your XML:
xml = '''<project logLevel="verbose" version="2.0" mainModule="Main" name="hidden">
<module name="Main">
<createWorkspace version="1.0"/>
<ftp version="1.0" label="FTP connection to PRD">
<put label="Put Files" destinationDir="destination1">
</put>
</ftp>
<ftp version="1.0" label="FTP connection to PRD">
<put label="Put Files" destinationDir="destination2">
</put>
</ftp>
<copy disabled="false" destDir="destination3">
</copy>
</module>
</project>
'''
# solution using ETree
from xml.etree import ElementTree as ET
root = ET.fromstring(xml)
name = root.get('name')
ftp_destination_dir1 = root.findall('./module/ftp/put')[0].get('destinationDir')
ftp_destination_dir2 = root.findall('./module/ftp/put')[1].get('destinationDir')
copy_destination_dir = root.find('./module/copy').get('destDir')
print(name)
print(ftp_destination_dir1)
print(ftp_destination_dir2)
print(copy_destination_dir)
# solution using lxml
from lxml import etree as et
root = et.fromstring(xml)
name = root.get('name')
ftp_destination_dirs = root.xpath('./module/ftp/put/@destinationDir')
copy_destination_dir = root.xpath('./module/copy/@destDir')[0]
print(name)
print(ftp_destination_dirs[0])
print(ftp_destination_dirs[1])
print(copy_destination_dir)

Extracting Raw XML via lxml etree

I'm trying to extract raw XML from an XML file.
So if my data is:
<xml>
... Lots of XML ...
<getThese>
<clonedKey>1</clonedKey>
<clonedKey>2</clonedKey>
<clonedKey>3</clonedKey>
<randomStuff>this is a sentence</randomStuff>
</getThese>
<getThese>
<clonedKey>6</clonedKey>
<clonedKey>8</clonedKey>
<clonedKey>3</clonedKey>
<randomStuff>more words</randomStuff>
</getThese>
... Lots of XML ...
</xml>
I can get the keys I want easily using etree:
from lxml import etree
search_me = etree.fromstring(xml_str)
search_me.findall('./xml/getThis')
But how do I get the actual content as raw XML? All the stuff I can see in the docs is for getting elements/text/attributes rather than the raw XML.
My desired output would be a list with two elements:
["<getThese>
<clonedKey>1</clonedKey>
<clonedKey>2</clonedKey>
<clonedKey>3</clonedKey>
<randomStuff>this is a sentence</randomStuff>
</getThese>",
"<getThese>
<clonedKey>6</clonedKey>
<clonedKey>8</clonedKey>
<clonedKey>3</clonedKey>
<randomStuff>more words</randomStuff>
</getThese>"]
You should be able to use tostring() to serialize the XML.
Example...
from lxml import etree
xml = """
<xml>
<getThese>
<clonedKey>1</clonedKey>
<clonedKey>2</clonedKey>
<clonedKey>3</clonedKey>
<randomStuff>this is a sentence</randomStuff>
</getThese>
<getThese>
<clonedKey>6</clonedKey>
<clonedKey>8</clonedKey>
<clonedKey>3</clonedKey>
<randomStuff>more words</randomStuff>
</getThese>
</xml>
"""
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.fromstring(xml, parser=parser)
elems = []
for elem in tree.xpath("getThese"):
elems.append(etree.tostring(elem).decode())
print(elems)
Printed output...
['<getThese><clonedKey>1</clonedKey><clonedKey>2</clonedKey><clonedKey>3</clonedKey><randomStuff>this is a sentence</randomStuff></getThese>', '<getThese><clonedKey>6</clonedKey><clonedKey>8</clonedKey><clonedKey>3</clonedKey><randomStuff>more words</randomStuff></getThese>']

Reading xml with lxml lib getting strange string from xmlns tag

I am writing a program to work on an XML file and change it. But when I try to access any part of it, I get some extra text in the tag name.
My xml file:
<?xml version="1.0" encoding="UTF-8"?>
<Package xmlns="http://soap.sforce.com/2006/04/metadata">
<types>
<members>sbaa__ApprovalChain__c.ExternalID__c</members>
<members>sbaa__ApprovalCondition__c.ExternalID__c</members>
<members>sbaa__ApprovalRule__c.ExternalID__c</members>
<name>CustomField</name>
</types>
<version>40.0</version>
</Package>
And I have my code:
from lxml import etree
import sys
tree = etree.parse('package.xml')
root = tree.getroot()
print( root[0][0].tag )
As output I expect to see members but I get something like this:
{http://soap.sforce.com/2006/04/metadata}members
Why do I see that url and how to stop it from showing up?
You have defined a default namespace (Wikipedia, lxml tutorial). When defined, it is a part of every child tag.
If you want to print the tag without the namespace, it's easy
tag = root[0][0].tag
print(tag[tag.find('}')+1:])
If you want to remove the namespace from XML, see this question.

Parsing XML using Python minidom

<PacketHeader>
<HeaderField>
<name>number</name>
<dataType>int</dataType>
</HeaderField>
</PacketHeader>
This is my small XML file and I want to extract out the text which is within the name tag.
Here is my code snippet:-
from xml.dom import minidom
from xml.dom.minidom import parse
xmldoc = minidom.parse('sample.xml')
packetHeader = xmldoc.getElementsByTagName("PacketHeader")
headerField = packetHeader.getElementsByTagName("HeaderField")
for field in headerField:
getFieldName = field.getElementsByTagName("name")
print getFieldName
But I am getting the location but not the text.
from xml.dom import minidom
from xml.dom.minidom import parse
xmldoc = minidom.parse('sample.xml')
# find the name element, if found return a list, get the first element
name_element = xmldoc.getElementsByTagName("name")[0]
# this will be a text node that contains the actual text
text_node = name_element.childNodes[0]
# get text
print text_node.data
Please check this.
Update
BTW, I suggest using ElementTree. Below is a code snippet using ElementTree that does the same thing as the minidom code above:
import elementtree.ElementTree as ET
tree = ET.parse("sample.xml")
# the tree root is the toplevel `PacketHeader` element
print tree.findtext("HeaderField/name")
A small variant of the accepted and correct answer above is:
from xml.dom import minidom
xmldoc = minidom.parse('fichier.xml')
name_element = xmldoc.getElementsByTagName('name')[0]
print name_element.childNodes[0].nodeValue
This simply uses nodeValue instead of its alias data

python ElementTree the text of element who has a child

When I try to read the text of an element that has a child, it gives None:
See the xml (say test.xml):
<?xml version="1.0"?>
<data>
<test><ref>MemoryRegion</ref> abcd</test>
</data>
and the python code that wants to read 'abcd':
import xml.etree.ElementTree as ET
tree = ET.parse('test.xml')
root = tree.getroot()
print root.find("test").text
When I run this python, it gives None, rather than abcd.
How can I read abcd under this condition?
Use Element.tail attribute:
>>> import xml.etree.ElementTree as ET
>>> tree = ET.parse('test.xml')
>>> root = tree.getroot()
>>> print root.find(".//ref").tail
abcd
ElementTree has a rather different view of XML that is more suited for nested data. .text is the data right after a start tag. .tail is the data right after an end tag. So you want:
print root.find('test/ref').tail

Categories

Resources