Remove XML node if childnode's childnode contains specific value - python

I need to filter an XML file for certain values, if the node contains this value, the node should be removed.
<?xml version="1.0" encoding="utf-8" ?>
<ogr:FeatureCollection
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://ogr.maptools.org/ TZwards.xsd"
xmlns:ogr="http://ogr.maptools.org/"
xmlns:gml="http://www.opengis.net/gml">
<gml:boundedBy></gml:boundedBy>
<gml:featureMember>
<ogr:TZwards fid="F0">
<ogr:Region_Nam>TARGET</ogr:Region_Nam>
<ogr:District_N>Kondoa</ogr:District_N>
<ogr:Ward_Name>Bumbuta</ogr:Ward_Name>
</ogr:TZwards>
</gml:featureMember>
<gml:featureMember>
<ogr:TZwards fid="F1">
<ogr:Region_Nam>REMOVE</ogr:Region_Nam>
<ogr:District_N>Kondoa</ogr:District_N>
<ogr:Ward_Name>Pahi</ogr:Ward_Name>
</ogr:TZwards>
</gml:featureMember>
</ogr:FeatureCollection>
The Python script should keep the <gml:featureMember> node if the <ogr:Region_Nam> contains TARGET and remove all other nodes.
from xml.dom import minidom
import xml.etree.ElementTree as ET
tree = ET.parse('input.xml').getroot()
removeList = list()
for child in tree.iter('gml:featureMember'):
if child.tag == 'ogr:TZwards':
name = child.find('ogr:Region_Nam').text
if (name == 'TARGET'):
removeList.append(child)
for tag in removeList:
parent = tree.find('ogr:TZwards')
parent.remove(tag)
out = ET.ElementTree(tree)
out.write(outputfilepath)
Desired output:
<?xml version="1.0" encoding="utf-8" ?>
<ogr:FeatureCollection>
<gml:boundedBy></gml:boundedBy>
<gml:featureMember>
<ogr:TZwards fid="F0">
<ogr:Region_Nam>TARGET</ogr:Region_Nam>
<ogr:District_N>Kondoa</ogr:District_N>
<ogr:Ward_Name>Bumbuta</ogr:Ward_Name>
</ogr:TZwards>
</gml:featureMember>
</ogr:FeatureCollection>
My output still contains all nodes..

You need to declare the namespaces in the python code:
from xml.dom import minidom
import xml.etree.ElementTree as ET
# Map the prefixes used in the path expressions below to the namespace URIs
# declared on the document's root element.
namespaces = {'gml': 'http://www.opengis.net/gml', 'ogr': 'http://ogr.maptools.org/'}

# Register the same prefixes for serialization; otherwise ElementTree writes
# auto-generated ns0:/ns1: prefixes instead of gml:/ogr:.
for prefix, uri in namespaces.items():
    ET.register_namespace(prefix, uri)

tree = ET.parse('/tmp/input.xml').getroot()

# findall() returns a list, so removing children of `tree` inside the loop
# does not disturb the iteration.
for child in tree.findall('gml:featureMember', namespaces=namespaces):
    # A single path lookup; yields None when either ogr:TZwards or
    # ogr:Region_Nam is absent instead of raising on a missing element.
    region = child.find('ogr:TZwards/ogr:Region_Nam', namespaces=namespaces)
    # Keep only the features whose region is TARGET; everything else
    # (including features without a region name) is removed.
    if region is None or region.text != 'TARGET':
        tree.remove(child)

out = ET.ElementTree(tree)
# Emit the XML declaration so the result matches the input's prolog.
out.write("/tmp/out.xml", encoding="utf-8", xml_declaration=True)

Related

how to parse XML with namespace and attribute in Python?

Hi, I am trying to parse XML that uses namespaces and attributes.
I am close, using root.findall() and .get(),
but I am still struggling to get the right values out of the XML file.
How do I get the XML attribute values?
Input:
<?xml version="1.0" encoding="UTF-8"?><message:GenericData
xmlns:message="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message"
xmlns:common="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:generic="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic"
xsi:schemaLocation="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message https://sdw-
wsrest.ecb.europa.eu:443/vocabulary/sdmx/2_1/SDMXMessage.xsd
http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common https://sdw-
wsrest.ecb.europa.eu:443/vocabulary/sdmx/2_1/SDMXCommon.xsd
http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic https://sdw-
wsrest.ecb.europa.eu:443/vocabulary/sdmx/2_1/SDMXDataGeneric.xsd">
<generic:Obs>
<generic:ObsDimension value="1999-01"/>
<generic:ObsValue value="0.7029125"/>
</generic:Obs>
<generic:Obs>
<generic:ObsDimension value="1999-02"/>
<generic:ObsValue value="0.688505"/>
</generic:Obs>
Code:
import xml.etree.ElementTree as ET
tree = ET.parse("file.xml")
root = tree.getroot()
for x in root.findall('.//'):
print(x.tag, " ", x.get('value'))
Output:
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Obs None
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}ObsDimension 1999-01
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}ObsValue 0.7029125
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Obs None
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}ObsDimension 1999-02
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}ObsValue 0.688505
Expected_Output:
1999-01 0.7029125
1999-02 0.688505
How about this:
# One output line per top-level element: the "value" attributes of its
# children joined by a space (children without the attribute contribute "").
for parent in root:
    values = [child.get('value', "") for child in parent]
    line = ' '.join(values)
    print(line)

Iterating through xml file

I am trying to get all surnames from an XML file, but when I use find, it throws an exception:
TypeError: 'NoneType' object is not iterable
This is my code:
import xml.etree.ElementTree as ET
tree = ET.parse('test.xml')
root = tree.getroot()
for elem in root:
for subelem in elem:
for subsubelem in subelem.find('surname'):
print(subsubelem.text)
When I remove the find('surname') call from the code, it returns the text of every sub-sub-element.
This is xml:
<?xml version="1.0" encoding="UTF-8"?>
<pp:card xmlns:pp="http://xmlns.page.com/path/subpath">
<pp:id>1</pp:id>
<pp:customers>
<pp:customer>
<pp:name>John</pp:name>
<pp:surname>Walker</pp:surname>
<pp:adress>
<pp:street>Walker street</pp:street>
<pp:number>1/1</pp:number>
<pp:state>England</pp:state>
</pp:adress>
<pp:created>2021-03-08Z</pp:created>
</pp:customer>
<pp:customer>
<pp:name>Michael</pp:name>
<pp:surname>Jordan</pp:surname>
<pp:adress>
<pp:street>Jordan street</pp:street>
<pp:number>28</pp:number>
<pp:state>USA</pp:state>
</pp:adress>
<pp:created>2021-03-09Z</pp:created>
</pp:customer>
</pp:customers>
</pp:card>
How should I fix it?
Not really a python person, but should the "find" statement include the "pp:" in its search, such as,
find('pp:surname')
Neither the opening nor closing tags actually match "surname".
Use the namespace when you call findall
import xml.etree.ElementTree as ET
# Sample document: every element lives in the namespace bound to "pp".
xml = '''<?xml version="1.0" encoding="UTF-8"?>
<pp:card xmlns:pp="http://xmlns.page.com/path/subpath">
<pp:id>1</pp:id>
<pp:customers>
<pp:customer>
<pp:name>John</pp:name>
<pp:surname>Walker</pp:surname>
<pp:adress>
<pp:street>Walker street</pp:street>
<pp:number>1/1</pp:number>
<pp:state>England</pp:state>
</pp:adress>
<pp:created>2021-03-08Z</pp:created>
</pp:customer>
<pp:customer>
<pp:name>Michael</pp:name>
<pp:surname>Jordan</pp:surname>
<pp:adress>
<pp:street>Jordan street</pp:street>
<pp:number>28</pp:number>
<pp:state>USA</pp:state>
</pp:adress>
<pp:created>2021-03-09Z</pp:created>
</pp:customer>
</pp:customers>
</pp:card>'''

# Prefix -> URI mapping used to resolve "pp:" in the search path.
ns = {'pp': 'http://xmlns.page.com/path/subpath'}
root = ET.fromstring(xml)

# Collect the text of every pp:surname element at any depth, in document order.
surname_path = './/pp:surname'
names = [element.text for element in root.iterfind(surname_path, namespaces=ns)]
print(names)
output
['Walker', 'Jordan']

Extracting Child XML using ElementTree ignoring Namespace

I have the following XML that I would like to extract a portion of the child if name matches "Adam"
<data>
<a:config version="1.0" xmlns:a="uri:abc.com/a" xmlns:b="uri:abc.com/b">
<a:xxx config="ABC">
<set>option_on</set>
<location>/123/123</location>
<data>123</data>
</a:xxx>
<a:xxx name="Adam">
<a:yyy value="5555-5555">
<log>true</log>
</a:yyy>
</a:xxx>
<a:xxx name="Lisa">
<a:yyy value="2222-2222">
<log>false</log>
</a:yyy>
</a:xxx>
</a:config>
</data>
I managed to extract the section, but the output does not keep the original namespace prefixes; it shows ns0 and ns1 instead. Below is my code:
import xml.etree.ElementTree as ET
tree2 = ET.parse("mycode.xml")
root2= tree2.getroot()
for elem in tree2.iter(tag='{uri:abc.com/a}xxx'):
match = elem.get('name')
if match == "Adam":
bla = ET.dump(elem)
Output as follows: -
<ns0:xxx xmlns:ns0="uri:abc.com/a" name="Adam">
<ns0:yyy value="5555-5555">
<log>true</log>
</ns0:yyy>
</ns0:xxx>
I am hoping to get exactly as what the original document is:-
<a:xxx name="Adam">
<a:yyy value="5555-5555">
<log>true</log>
</a:yyy>
</a:xxx>
Use the register_namespace function.
import xml.etree.ElementTree as ET
tree2 = ET.parse("mycode.xml")
root2 = tree2.getroot()

# Register the 'a' prefix to be used when serializing
ET.register_namespace("a", "uri:abc.com/a")

# Visit every xxx element in the "uri:abc.com/a" namespace and print the one
# whose name attribute is "Adam".
for elem in tree2.iter(tag='{uri:abc.com/a}xxx'):
    if elem.get('name') == "Adam":
        # ET.dump() writes to sys.stdout and returns None, so there is
        # nothing useful to assign.
        ET.dump(elem)
Output:
<a:xxx xmlns:a="uri:abc.com/a" name="Adam">
<a:yyy value="5555-5555">
<log>true</log>
</a:yyy>
</a:xxx>
This is not the exact output that you asked for. You cannot force ElementTree to omit the namespace declaration (because doing so would make the output ill-formed).

How to insert children of one xml node in another xml node with python

I have the following XML file:
<root>
<nodeA>
<childrens_A>
</nodeA>
<nodeB>
<childrens_B>
</nodeB>
<nodeA>
<childrens_A>
</nodeA>
<nodeB>
<childrens_B>
</nodeB>
</root>
I want to get something like this:
<root>
<nodeA>
<childrens_A>
<childrens_B>
</nodeA>
<nodeA>
<childrens_A>
<childrens_B>
</nodeA>
</root>
The numbers of nodeA and nodeB elements are equal.
I can only import from the Python standard library; I cannot use lxml because of access restrictions, so I am limited to from xml.etree import ElementTree as et
My code is:
from xml.etree import ElementTree as et
tree = et.parse(path_in)
root = tree.getroot()
for child in root.gethcildren()
if child.tag == "nodeA"
#insert children of nodeB in nodeA
tr.write(path_out)
Thanks in advance!
It looks like I found a solution:
from xml.etree import ElementTree as et
tr = et.parse(path_in)
root = tr.getroot()

# Most recently seen nodeA; each following nodeB is merged into it.
target = None
# Iterate over a snapshot: root is modified inside the loop.
# (Element.getchildren() was removed in Python 3.9; list(elem) replaces it.)
for child in list(root):
    if child.tag == 'nodeA':
        target = child
    elif child.tag == 'nodeB' and target is not None:
        # Move nodeB's children under the preceding nodeA ...
        target.extend(list(child))
        # ... and drop the now-redundant nodeB so only nodeA elements remain.
        root.remove(child)

tr.write(path_out)
I hope this answer helps somebody someday.

Parsing XML with ElementTree in Python

I have XML like this:
<parameter>
<name>ec_num</name>
<value>none</value>
<units/>
<url/>
<id>2455</id>
<m_date>2008-11-29 13:15:14</m_date>
<user_id>24</user_id>
<user_name>registry</user_name>
</parameter>
<parameter>
<name>swisspro</name>
<value>Q8H6N2</value>
<units/>
I want to parse the XML and extract the <value> entry which is just below the <name> entry marked 'swisspro'. I.e. I want to parse and extract the 'Q8H6N2' value.
How would I do this using ElementTree?
It would be much easier to do via lxml, but here's a solution using the ElementTree library:
import xml.etree.ElementTree as ET
data = """<parameters>
<parameter>
<name>ec_num</name>
<value>none</value>
<units/>
<url/>
<id>2455</id>
<m_date>2008-11-29 13:15:14</m_date>
<user_id>24</user_id>
<user_name>registry</user_name>
</parameter>
<parameter>
<name>swisspro</name>
<value>Q8H6N2</value>
<units/>
</parameter>
</parameters>"""
tree = ET.fromstring(data)
# Scan the parameter elements and stop at the first one named 'swisspro'.
for parameter in tree.iter(tag='parameter'):
    name = parameter.find('name')
    # Parameters without a <name> child are skipped.
    if name is not None and name.text == 'swisspro':
        # print() is a function in Python 3.
        print(parameter.find('value').text)
        break
prints:
Q8H6N2
The idea is pretty simple: iterate over all parameter tags, check the value of the name tag and if it is equal to swisspro, get the value element.
Hope that helps.
Here is an example:
xml file
<?xml version="1.0" encoding="utf-8"?>
<root>
<person age="18">
<name>hzj</name>
<sex>man</sex>
</person>
<person age="19" des="hello">
<name>kiki</name>
<sex>female</sex>
</person>
</root>
parse method
from xml.etree import ElementTree
def print_node(node):
    """Print basic information about an element.

    Writes a separator line, the attribute mapping, the ``age`` attribute
    (only when present), the tag, and the text of ``node`` to stdout.

    Args:
        node: An ``xml.etree.ElementTree.Element``.
    """
    print("==============================================")
    print("node.attrib:%s" % node.attrib)
    # dict.has_key() no longer exists in Python 3; use the `in` operator.
    if "age" in node.attrib:
        print("node.attrib['age']:%s" % node.attrib['age'])
    print("node.tag:%s" % node.tag)
    print("node.text:%s" % node.text)
def read_xml(text):
    """Parse an XML string and demonstrate four ways of reaching elements.

    Each selected element is passed to ``print_node``.

    Args:
        text: The XML document as a string; it is expected to contain at
            least two ``person`` elements, each with a ``name`` child.
    """
    # Alternative: ElementTree.parse(path).getroot() reads from a file instead.
    root = ElementTree.fromstring(text)

    # 1. All <person> elements at any depth. Element.getiterator() was
    #    removed in Python 3.9; iter() replaces it. It returns an iterator,
    #    so materialize it because it is indexed below.
    lst_node = list(root.iter("person"))
    for node in lst_node:
        print_node(node)

    # 2. First child of the first <person>. Element.getchildren() was
    #    removed in Python 3.9; elements are indexable directly.
    lst_node_child = lst_node[0][0]
    print_node(lst_node_child)

    # 3. First direct <person> child via find().
    node_find = root.find('person')
    print_node(node_find)

    # 4. Second match of the "person/name" path via findall().
    node_findall = root.findall("person/name")[1]
    print_node(node_findall)
if __name__ == '__main__':
    # The context manager closes the file as soon as its contents are read.
    with open("test.xml") as xml_file:
        read_xml(xml_file.read())

Categories

Resources