Want to remove all children when removing an element with lxml - python

Can't seem to figure out how to remove the element 'framelineName' and all the sub-elements attached to it. Bottom area in the else statement will only delete the element framelineName. I want to also delete 'line', 'left', and 'right'.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import etree as ET
def cash_rules_everything_around_me():
    """Populate the module-level ``root`` element and write it to wutang.xml.

    Adds ``Shaolin`` and ``wtClan`` children to ``root`` (``Shaolin`` gets an
    ``RZA`` child with two sub-elements), then either sets the text of
    ``wtClan`` or removes every ``wtClan`` element whose ``StatenIsland``
    attribute is ``'NYC'``, and finally serialises the tree to disk.
    """
    shaolin = ET.SubElement(root, "Shaolin")
    wtClan = ET.SubElement(root, "wtClan")
    wtClan.set('StatenIsland', 'NYC')
    RZA = ET.SubElement(shaolin, "RZA")
    RZA.set('StatenIsland', 'NYC')
    gf = ET.SubElement(RZA, "GhostfaceKillah")
    rk = ET.SubElement(RZA, "Raekwon")
    wutang = "36 chambers"
    # NOTE(review): iterating a str yields one character at a time, so `wu`
    # is never equal to the whole string and the else-branch is not reached.
    for wu in wutang:
        if wu != "36 chambers":
            wtClan.text = "A Tribe Called Quest"
        else:
            # XPath selects attributes with "@"; removing an element through
            # its parent drops the element together with all its children.
            for w in root.xpath("//wtClan [@StatenIsland='NYC']"):
                w.getparent().remove(w)
    tree = ET.ElementTree(root)
    tree.write("wutang.xml", pretty_print=True, xml_declaration=True, encoding='UTF-8')


if __name__ == '__main__':
    root = ET.Element("HipHop")
    cash_rules_everything_around_me()

To remove an element you need the actual element object not a list which is the return of lxml's xpath. Consider findall for iterating through element and move xpath logic to an if statement:
...
# ITERATE THROUGH A LIST (NOT STRING)
for wu in [wutang]:
    if wu != "36 chambers":
        wtClan.text = "A Tribe Called Quest"
    else:
        # findall() takes a path relative to the element (an absolute
        # "//..." path is rejected) and returns a list, so removing items
        # inside the loop is safe. "wtClan" matches direct children of root,
        # which is what root.remove() requires.
        for w in root.findall("wtClan"):
            if w.attrib.get('StatenIsland') == 'NYC':
                root.remove(w)
tree = ET.ElementTree(root)
tree.write("wutang.xml", pretty_print=True,
           xml_declaration=True, encoding='UTF-8')
Rextester demo (using built-in etree but compatible with lxml)

Related

How to indent my xml data which in xml file python [duplicate]

After reading from an existing file with 'ugly' XML and doing some modifications, pretty printing doesn't work. I've tried etree.write(FILE_NAME, pretty_print=True).
I have the following XML:
<testsuites tests="14" failures="0" disabled="0" errors="0" time="0.306" name="AllTests">
<testsuite name="AIR" tests="14" failures="0" disabled="0" errors="0" time="0.306">
....
And I use it like this:
tree = etree.parse('original.xml')
root = tree.getroot()
...
# modifications
...
# lxml serialises to bytes, so the target file must be opened in binary mode.
with open(FILE_NAME, "wb") as f:
    tree.write(f, pretty_print=True)
For me, this issue was not solved until I noticed this little tidbit here:
http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output
Short version:
Read in the file with this command:
>>> parser = etree.XMLParser(remove_blank_text=True)
>>> tree = etree.parse(filename, parser)
That will "reset" the already existing indentation, allowing the output to generate its own indentation correctly. Then pretty_print as normal:
>>> tree.write(<output_file_name>, pretty_print=True)
Well, according to the API docs, there is no method "write" in the lxml etree module. You've got a couple of options in regards to getting a pretty printed xml string into a file. You can use the tostring method like so:
# etree.tostring() returns bytes by default, so write in binary mode; the
# context manager closes the file even if serialisation fails.
with open('doc.xml', 'wb') as f:
    f.write(etree.tostring(root, pretty_print=True))
Or, if your input source is less than perfect and/or you want more knobs and buttons to configure your output, you could use one of the Python wrappers for the tidy lib.
http://utidylib.berlios.de/
# Run the XML text through the `tidy` module with XML in/out and indenting
# enabled, then write the result to `f`.
# NOTE(review): assumes `f` is an open, writable file and `your_xml_str`
# holds the XML text; neither is defined in this snippet - confirm.
import tidy
f.write(tidy.parseString(your_xml_str, **{'output_xml':1, 'indent':1, 'input_xml':1}))
http://countergram.com/open-source/pytidylib
# Same idea using tidylib: tidy_document() is unpacked into the tidied
# document and an errors value; only the document is written out.
# NOTE(review): assumes `f` is an open, writable file and `your_xml_str`
# holds the XML text; neither is defined in this snippet - confirm.
from tidylib import tidy_document
document, errors = tidy_document(your_xml_str, options={'output_xml':1, 'indent':1, 'input_xml':1})
f.write(document)
# open() replaces the Python 2-only file() builtin; the `...` stands for the
# element and options to serialise. encoding='unicode' makes tostring()
# return text, so print() writes the XML itself instead of a bytes repr.
with open('out.txt', 'w') as fp:
    print(etree.tostring(..., encoding='unicode'), file=fp)
Here is an answer that is fixed to work with Python 3:
from lxml import etree
from sys import stdout
from io import BytesIO
# Parse the raw bytes with a parser that drops ignorable whitespace, so that
# pretty_print is free to lay out the indentation from scratch.
blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
document = etree.parse(BytesIO(text), blank_stripping_parser)
# stdout.buffer is the binary layer of stdout; the tree is written as bytes.
document.write(stdout.buffer, pretty_print=True)
where text is the xml code as a sequence of bytes.
I am not sure why other answers did not mention this. If you want to obtain the root of the xml there is a method called getroot(). I hope I answered your question (though a little late).
# Parse the document, then ask the resulting tree for its root element.
# NOTE(review): `et` and `xmlFile` are not defined in this snippet -
# presumably an ElementTree-style module alias and a file path; confirm.
tree = et.parse(xmlFile)
root = tree.getroot()
Of course - pretty print of lxml.etree is possible.
In my case, the old trick with remove_blank_text=True and pretty_print=True was not working as I expected (was too delicate), so I decided to write it by myself.
Here it is - a modern, forcible, native pythonic way to correct lxml.etree.Element tree indentation.
This gives a nicely prettified XML string:
from typing import Optional
import lxml.etree
def indent_lxml(element: lxml.etree.Element, level: int = 0, is_last_child: bool = True) -> None:
    """Rewrite the whitespace of ``element`` and its descendants in place.

    Only ``text`` and ``tail`` are touched: each is stripped and re-padded
    with a newline plus one ``space`` unit per nesting level.

    :param element: node whose subtree is re-indented.
    :param level: nesting depth of ``element`` (0 for the outermost call).
    :param is_last_child: whether ``element`` is the last child of its
        parent; its tail then indents the parent's closing tag.
    """
    space = " "
    indent_str = "\n" + level * space
    inner_indent = f"{indent_str}{space}"

    # Leading text sits one level deeper than the element itself.
    stripped_text = strip_or_null(element.text)
    element.text = f"{inner_indent}{stripped_text}" if stripped_text else stripped_text

    num_children = len(element)
    if num_children:
        # Position the first child, then recurse into every child.
        element.text = f"{element.text or ''}{inner_indent}"
        last_index = num_children - 1
        for index, child in enumerate(element.iterchildren()):
            indent_lxml(child, level + 1, index == last_index)
    elif element.text:
        # Leaf with text: bring the closing tag back to this element's level.
        element.text += indent_str

    if is_last_child:
        tail_level = max(0, level - 1)
    else:
        tail_level = level
    tail_indent = "\n" + tail_level * space

    tail = strip_or_null(element.tail)
    if tail:
        element.tail = f"{indent_str}{tail}{tail_indent}"
    else:
        element.tail = tail_indent
def strip_or_null(text: Optional[str]) -> Optional[str]:
    """Return ``text`` without surrounding whitespace, or ``None`` if nothing is left.

    ``None`` input and whitespace-only input both yield ``None``.
    """
    if text is None:
        return None
    stripped = text.strip()
    return stripped if stripped else None
It's decently fast, because it doesn't allocate any additional structures in memory, and while traversing the tree it visits each node only once, giving the best possible computational complexity, O(N).
It rearranges all the existing indentation "in place" in the tree (the DOM) by correcting contents of Element.text and Element.tail attributes (affects white-spaces only).
Naturally, it also can be used with HTML parsed by lxml.
In order to use it, do something like that:
# Option 1: parse a file and take the root element of the resulting tree.
root = lxml.etree.parse("path/to/the_file.xml").getroot()
# Option 2 (alternative to option 1): build the root from an XML string.
root = lxml.etree.fromstring("<xml><body><leaf1/><leaf2/></body></xml>")
indent_lxml(root) # corrects indentation "in place"
# encoding="unicode" asks tostring() for a str, which is then printed.
result = lxml.etree.tostring(root, encoding="unicode")
print(result)
Which prints:
<xml>
<body>
<leaf1/>
<leaf2/>
</body>
</xml>

How can I parse the below XML data using Python?

Source XML
<?xml version='1.0' encoding='UTF-8'?>
<ProcessType xmlns:xmi="http://www.omg.org/XMI" xmi:version="2.0" defaultContext="Default">
<node componentName="tRedshiftRow" componentVersion="0.102" offsetLabelX="0" offsetLabelY="0" posX="-32" posY="96">
<elementParameter field="TECHNICAL" name="QUERYSTORE:QUERYSTORE_TYPE" value="BUILT_IN"/>
<elementParameter field="TEXT" name="DBNAME" value="&quot;&quot;"/>
<elementParameter field="TEXT" name="SCHEMA_DB" value="&quot;&quot;"/>
<elementParameter field="MEMO_SQL" name="QUERY" value="&quot;DELETE FROM schema.tablename;&quot;"/>
</node>
</ProcessType>
I want to get the DELETE statement only where tag is "QUERY", and write it in a text file.
Expected output : DELETE FROM schema.tablename;
I was trying the following way, which obviously didn't work out !
from lxml import etree, objectify
import xml.etree.ElementTree as ET
def convert_xml_to_comp():
    """Strip namespaces from source.xml, save it as done.xml, then scan it.

    The cleaned tree is re-read with the standard-library ElementTree and the
    nested ``get_sql_text`` helper walks it looking for the ``QUERY``
    parameter.
    """
    metadata = 'source.xml'
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(metadata, parser)
    root = tree.getroot()
    # iter() is the supported replacement for the deprecated getiterator().
    for elem in root.iter():
        # print(elem)
        # Comments and processing instructions have a non-string tag; skip them.
        if not isinstance(elem.tag, str):
            continue
        # Namespaced tags look like "{uri}local"; keep only the local part.
        i = elem.tag.find('}')
        if i >= 0:
            elem.tag = elem.tag[i+1 :]
    objectify.deannotate(root, cleanup_namespaces=True)
    tree.write('done.xml', pretty_print=True, xml_declaration=True, encoding='UTF-8')
    tree = ET.parse('done.xml')
    root = tree.getroot()

    def get_sql_text():
        """Walk each child of the root and print the lookup for the DELETE statement."""
        # The context manager closes the file even if the scan raises.
        with open("newdelete.txt", "w") as file:
            for node in tree.getroot():
                ### Get the elements' names ###
                for elementParameter in node.iterfind('elementParameter[@name="UNIQUE_NAME"]'):
                    name = elementParameter.get('value')
                ### Get the elements' name and SQL ###
                for elementParameter in node.iterfind('elementParameter[@name="QUERY"]'):
                    #print (root.attrib)
                    val = elementParameter.get('value')
                    # NOTE(review): this searches for a child element named
                    # "val" rather than using the `val` string read above,
                    # and nothing is ever written to `file`.
                    print(node.find('val[@value="DELETE FROM schema.tablename;"]'))

    get_sql_text()


if __name__ == '__main__':
    convert_xml_to_comp()
You can do all of this in just a couple of statements using an XPath query. Something like:
>>> from lxml import etree
>>> doc = etree.parse(open('data.xml'))
>>> query = doc.xpath('//elementParameter[@name="QUERY"]')[0].get('value')
>>> print(query)
"DELETE FROM schema.tablename;"
This says "find all the elementParameter elements with name="QUERY"", and then returns the value of the value attribute of the first one.
To select just those elements that contain "DELETE" in their value attribute, use the contains() function:
>>> doc.xpath('//elementParameter[@name="QUERY" and contains(@value, "DELETE")]')

Insert a tree under another tree (lxml)

I need to insert the whole contents of one XML tree into another tree (under its elements with a certain tag). I'm using the iter() method to iterate over the elements of the tree to be modified. The problem is, the first tree only gets inserted once for some reason.
Could anyone tell me what I'm doing wrong?
from lxml import etree
# Creating the first tree
root1 = etree.Element('root', name = 'Root number one')
tree1 = etree.ElementTree(root1)
for n in range(1, 5):
    new_element = etree.SubElement(root1, 'element' + str(n))
    new_child = etree.Element('child')
    new_child.text = 'Test' + str(n)
    new_element.insert(0, new_child)

# Creating the second tree
root2 = etree.Element('root', name = 'Root number two')
tree2 = etree.ElementTree(root2)
for n in range(1, 3):
    new_element = etree.SubElement(root2, 'element')
    new_child = etree.Element('child')
    new_child.text = 'Test' + str(n)
    new_element.insert(0, new_child)

# Printing the trees to files to see what they look like
# (context managers make sure each file is flushed and closed)
with open('file1.xml', 'w') as outFile1:
    print(etree.tostring(tree1, encoding='unicode', pretty_print=True), file=outFile1)
with open('file2.xml', 'w') as outFile2:
    print(etree.tostring(tree2, encoding='unicode', pretty_print=True), file=outFile2)

# Here I'm using the iter() method to iterate over the elements of tree2
# Under each element tagged as "element" I need to insert the whole contents
# of tree1
for element in tree2.iter():
    if element.tag == 'element':
        # NOTE(review): every pass inserts the very same root element of
        # tree1, not a copy of it.
        new_child = tree1.getroot()
        element.insert(0, new_child)

with open('file3.xml', 'w') as outFile3:
    print(etree.tostring(tree2, encoding='unicode', pretty_print=True), file=outFile3)
Quoth the lxml tutorial:
If you want to copy an element to a different position in lxml.etree, consider creating an independent deep copy using the copy module from Python's standard library.
So, in your example,
import copy

# Collect the targets first so the tree is not modified while it is being
# lazily traversed; each target then receives its own independent copy of
# tree1's root.
for element in list(tree2.iter('element')):
    new_child = copy.deepcopy(tree1.getroot())
    element.insert(0, new_child)

variable in XML subelement

I'm thinking of Python code to create a dynamic xml ETREE subElement.
I have a hierarchical header to describe a piece of a book, as follows:
<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text
</Booktype>
</Books>
How to get a hierarchical xml tag like this :
<Books>
<Booktype>
<Story>
<Fiction>
<Young>
#here the rest of book text
</Young>
</Fiction>
</Story>
</Booktype>
</Books>
This is my code:
import re
import xml.etree.ElementTree as ET
from xml.etree import ElementTree
# Break the "a>b>c" header into its individual tag names.
List = "Story>Fiction>Young".split('>')

root = ET.Element('Books')
Booktype = ET.SubElement(root, 'Booktype')
for tag_name in List:
    # Create a child of Booktype for this name, then append that same node
    # to Booktype once more.
    ND = ET.SubElement(Booktype, str(tag_name))
    Booktype.append(ND)

tree = ET.ElementTree(root)
ElementTree.tostring(root, 'utf-8')
I got this bad result:
'<Books><Booktype><Story /><Story /><Story /><Fiction /><Fiction /><Young /><Young /><Story /><Story /><Fiction /><Fiction /><Young /><Young /></Booktype></Books>'
If you want to nest the list elements you have to keep the reference to the previous one so you can add the child element to it, and not to the Booktype element. See the variable current in the examples.
from xml.etree import ElementTree as ET
xml_string = '''<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text 2
</Booktype>
</Books>
'''
xml = ET.fromstring(xml_string)
for booktype in xml.findall('Booktype'):
    # "Story > Fiction > Young" -> ['Story', 'Fiction', 'Young']
    types = [name.strip() for name in booktype.get('List').split('>')]
    # `current` always points at the most deeply nested element so far, so
    # each new tag becomes a child of the previous one.
    current = booktype
    for t in types:
        current = ET.SubElement(current, t)
    # Move the book text down to the innermost element and drop the header.
    current.text = booktype.text
    booktype.text = ''
    del booktype.attrib['List']
# encoding='unicode' makes tostring() return str, so print() shows the XML
# itself rather than a bytes repr.
print(ET.tostring(xml, encoding='unicode'))
Gives me the result:
<Books>
<Booktype><Story><Fiction><Young>
#here the rest of book text
</Young></Fiction></Story></Booktype>
<Booktype><Science><Math><Young>
#here the rest of book text 2
</Young></Math></Science></Booktype>
</Books>
And if you want to create a completely new structure you can do:
xml = ET.fromstring(xml_string)
# Build a fresh <Books> tree instead of modifying the parsed one.
root = ET.Element('Books')
for booktype in xml.findall('Booktype'):
    current = ET.SubElement(root, 'Booktype')
    # Each tag name from the List attribute nests inside the previous one.
    for t in (name.strip() for name in booktype.get('List').split('>')):
        current = ET.SubElement(current, t)
    current.text = booktype.text
# encoding='unicode' makes tostring() return str for print().
print(ET.tostring(root, encoding='unicode'))

ElementTree element index look up

I'm using the xml.etree.ElementTree module to create an XML document with Python 3.1 from another structured document.
What ElementTree function can I use that returns the index of an existing sub element?
The getchildren method returns a list of sub-elements of an Element object. You could then use the built-in index method of a list.
>>> import xml.etree.ElementTree as ET
>>> root = ET.Element("html")
>>> head = ET.SubElement(root, "head")
>>> body = ET.SubElement(root, "body")
>>> root.getchildren().index(body)
1
import xml.etree.ElementTree as ET
# Raw string: in an ordinary literal the "\U" in this Windows path starts a
# unicode escape and the file fails to compile.
root = ET.Element(r'C:\Users\Administrator\Desktop\ValidationToolKit_15.9\ValidationToolKit_15.9\NE3S_VTK\webservice\history\ofas.2017-1-3.10-55-21-608.xml')
childnew = ET.SubElement(root, "354")
# Element.getchildren() was removed in Python 3.9; list(root) gives the same
# list of direct children, and list.index() finds the child's position.
list(root).index(childnew)
# 0
def alarms_validation(self, path, alarm_no, alarm_text):
    """Scan an XML file for an alarm number and check the text that follows it.

    For each of the first 10000 children of the root, the first six
    sub-elements are compared with ``alarm_no``. On a match, the text of the
    next sub-element is compared with ``alarm_text`` and the outcome is
    printed. Record and field indexes are printed as the scan proceeds.

    :param path: path of the XML file to read.
    :param alarm_no: element text that identifies the alarm.
    :param alarm_text: text expected in the element right after the match.
    """
    with open(path) as f:
        root = et.parse(f).getroot()

    # Slicing bounds the scan without indexing past the end, so a short
    # record no longer aborts the whole search through a swallowed IndexError.
    for x, record in enumerate(root[:10000]):
        print(x)
        for y, field in enumerate(record[:6]):
            print(y)
            if field.text != alarm_no:
                continue
            print("found")
            # A match in the last position has no following element; treat
            # the missing text as a mismatch.
            following = record[y + 1].text if y + 1 < len(record) else None
            if following != alarm_text:
                print("Alarm text is not proper")
            else:
                print("Alarm Text is proper")

Categories

Resources