Insert a tree under another tree (lxml) - python

I need to insert the whole contents of one XML tree into another tree (under its elements with a certain tag). I'm using the iter() method to iterate over the elements of the tree to be modified. The problem is, the first tree only gets inserted once for some reason.
Could anyone tell me what I'm doing wrong?
from lxml import etree
# Creating the first tree: <root> with children element1..element4,
# each holding a single <child> whose text is "Test<n>".
root1 = etree.Element('root', name='Root number one')
tree1 = etree.ElementTree(root1)
for n in range(1, 5):
    new_element = etree.SubElement(root1, 'element' + str(n))
    new_child = etree.Element('child')
    new_child.text = 'Test' + str(n)
    new_element.insert(0, new_child)

# Creating the second tree: <root> with two <element> children.
root2 = etree.Element('root', name='Root number two')
tree2 = etree.ElementTree(root2)
for n in range(1, 3):
    new_element = etree.SubElement(root2, 'element')
    new_child = etree.Element('child')
    new_child.text = 'Test' + str(n)
    new_element.insert(0, new_child)

# Printing the trees to files to see what they look like.
# Each file is opened in a with-block so it is closed (and flushed) right
# after it has been written.
with open('file1.xml', 'w') as outFile1:
    print(etree.tostring(tree1, encoding='unicode', pretty_print=True), file=outFile1)
with open('file2.xml', 'w') as outFile2:
    print(etree.tostring(tree2, encoding='unicode', pretty_print=True), file=outFile2)

# Here I'm using the iter() method to iterate over the elements of tree2
# Under each element tagged as "element" I need to insert the whole contents
# of tree1
for element in tree2.iter():
    if element.tag == 'element':
        # NOTE: this is the very same root1 object on every match.
        new_child = tree1.getroot()
        element.insert(0, new_child)

with open('file3.xml', 'w') as outFile3:
    print(etree.tostring(tree2, encoding='unicode', pretty_print=True), file=outFile3)

Quoth the lxml tutorial:
If you want to copy an element to a different position in lxml.etree, consider creating an independent deep copy using the copy module from Python's standard library.
So, in your example,
import copy

# Collect the target elements first: inserting into the tree while iter() is
# still walking it would make the walk descend into the inserted copies.
for element in list(tree2.iter('element')):
    # Each target gets its own independent deep copy of tree1's root, so a
    # later insert cannot take the subtree away from an earlier target.
    new_child = copy.deepcopy(tree1.getroot())
    element.insert(0, new_child)

Related

Are there any potential issues with using XPath to find and update XML tag text?

I wrote a script based on some existing StackOverflow questions, but none of them fit my problem exactly.
The user supplies an XPath to find an XML tag in a given XML file, and the tag's text is updated based on the user's input.
Below is my script using Python 3 (the most difficult part for me is the namespace handling):
import xml.etree.ElementTree as ET
import sys
# user inputs and variables
filename = 'actors.xml'
xpath = 'actor/name'
value = 'test name'
temp_namespace = 'temp_namespace'

# get all namespaces: every 'start-ns' event supplies one (prefix, uri) pair
all_namespaces = {}
for _, ns_pair in ET.iterparse(filename, events=['start-ns']):
    ns_prefix, ns_uri = ns_pair
    all_namespaces[ns_prefix] = ns_uri

# register namespace
for ns_prefix, ns_uri in all_namespaces.items():
    ET.register_namespace(ns_prefix, ns_uri)

# remove all namespace from elements first
# and temp save it to tag attribute
# The below logic is copied from other Stackoverflow answers
# From **Python 3.8**, we can add the parser to insert comments
comment_keeping_parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True))
it = ET.iterparse(filename, parser=comment_keeping_parser)
for _, node in it:
    # "{uri}local" splits into ("{uri", "}", "local"); a tag without a
    # namespace has no "}" and is left alone.
    ns_part, closing_brace, local_name = node.tag.partition('}')
    if not closing_brace:
        continue
    node.tag = local_name
    node.set(temp_namespace, ns_part + closing_brace)

# find and update
root = it.root
new_text = str(value)
for match in root.findall(xpath):
    match.text = new_text

# get xml comments before root level: keep every line between the XML
# declaration and the line that opens the root element
doc_comments = []
root_open = '<' + root.tag
with open(filename, 'r') as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith('<?xml'):
            continue
        if line.startswith(root_open):
            break
        doc_comments.append(line)
def add_tag_namespace(el, attr_name=None):
    """Restore namespace-qualified tags on *el* and all of its descendants.

    Every element that carries the temporary attribute gets that attribute's
    value prepended to its tag, and the attribute is then removed. Elements
    without the attribute are left untouched.

    Args:
        el: Root of the subtree to process; it is modified in place.
        attr_name: Name of the temporary attribute that holds the tag prefix.
            Defaults to the module-level ``temp_namespace``.
    """
    if attr_name is None:
        attr_name = temp_namespace
    # el.iter() yields el itself and then every descendant, so one flat pass
    # visits each element exactly once and needs no recursion.
    for node in el.iter():
        saved_prefix = node.attrib.pop(attr_name, None)
        if saved_prefix is not None:
            node.tag = saved_prefix + node.tag
# add all namespace back
# and delete the temp namespace attribute
add_tag_namespace(root)

# write back to xml file
tree = ET.ElementTree(root)
tree.write(filename, encoding='unicode', xml_declaration=True)

# nothing was collected ahead of the root element, so the file is complete
if not doc_comments:
    sys.exit()

# write xml comments before root back
# first read all lines
with open(filename, 'r') as f:
    lines = f.readlines()

# second, splice the saved lines in directly after the XML declaration
for position, text in enumerate(lines):
    if text.startswith('<?xml'):
        lines[position + 1:position + 1] = doc_comments
        break

# finally, write all contents to file
with open(filename, 'w') as f:
    f.writelines(lines)
actors.xml:
<?xml version="1.0"?>
<actors xmlns:fictional="http://characters.example.com"
xmlns="http://people.example.com">
<actor>
<name>John Cleese</name>
<fictional:character>Lancelot</fictional:character>
<fictional:character>Archie Leach</fictional:character>
</actor>
<actor>
<name>Eric Idle</name>
<fictional:character>Sir Robin</fictional:character>
<fictional:character>Gunther</fictional:character>
<fictional:character>Commander Clement</fictional:character>
</actor>
</actors>

Want to remove all children when removing an element with lxml

Can't seem to figure out how to remove the element 'framelineName' and all the sub-elements attached to it. Bottom area in the else statement will only delete the element framelineName. I want to also delete 'line', 'left', and 'right'.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml import etree as ET
def cash_rules_everything_around_me():
    """Build the sample tree under the module-level ``root`` and write it to wutang.xml."""
    shaolin = ET.SubElement(root, "Shaolin")
    wtClan = ET.SubElement(root, "wtClan")
    wtClan.set('StatenIsland', 'NYC')
    RZA = ET.SubElement(shaolin, "RZA")
    RZA.set('StatenIsland', 'NYC')
    gf = ET.SubElement(RZA, "GhostfaceKillah")
    rk = ET.SubElement(RZA, "Raekwon")
    wutang = "36 chambers"
    # NOTE: iterating over a str yields one character at a time.
    for wu in wutang:
        if wu != "36 chambers":
            wtClan.text = "A Tribe Called Quest"
        else:
            # XPath attribute tests are written with "@".
            for w in root.xpath("//wtClan [@StatenIsland='NYC']"):
                w.getparent().remove(w)
    tree = ET.ElementTree(root)
    tree.write("wutang.xml", pretty_print=True, xml_declaration=True, encoding='UTF-8')


if __name__ == '__main__':
    root = ET.Element("HipHop")
    cash_rules_everything_around_me()
To remove an element you need the actual element object, not the list that lxml's xpath returns. Consider findall for iterating through the elements, and move the XPath logic into an if statement:
...
# ITERATE THROUGH A LIST (NOT STRING)
for wu in [wutang]:
    if wu != "36 chambers":
        wtClan.text = "A Tribe Called Quest"
    else:
        # findall() rejects absolute paths on an element, so the path
        # starts with "." to search all descendants of root.
        for w in root.findall(".//wtClan"):
            if w.get('StatenIsland') == 'NYC':
                # Removing an element from its parent drops the element
                # together with everything nested inside it.
                w.getparent().remove(w)

tree = ET.ElementTree(root)
tree.write("wutang.xml", pretty_print=True,
           xml_declaration=True, encoding='UTF-8')
Rextester demo (using built-in etree but compatible with lxml)

Generate XML files based on rows in CSV

I have a CSV and would like to generate an XML file based on each row in the CSV.
Right now it creates an XML file, but only with the last row in the CSV. How can I modify this script to generate an XML file for EACH row, ideally with the filename taken from the column "File / Entity Name"? See below for what I currently have. Thanks!
# CSV module
import csv
# Stuff from the XML module
from xml.etree.ElementTree import Element, SubElement, tostring, ElementTree
import xml.etree.ElementTree as etree
# Topmost XML element
root = Element('root')
number = Element('number')

# Open a file
with open(r'U:\PROJECTS\Technical Graphics\book1.csv') as f:
    for row in csv.DictReader(f):
        # A fresh root/tree is built for every CSV row ...
        root = Element('gmd:MD_Metadata')
        tree = ElementTree(root)
        for k, v in row.items():
            child = SubElement(root, k)
            child.text = v
        # A second reader is created on every pass but never used.
        reader = csv.DictReader(f)
        # ... but every pass writes to the same path, so each row's output
        # replaces the previous one. Passing the path (not an open file)
        # lets ElementTree open and close the file itself.
        tree.write(r'U:\PROJECTS\Technical Graphics\test.xml')
        print(tostring(root))
You set the value of Root here:
for row in csv.DictReader(f):
    root = Element('gmd:MD_Metadata')
    tree = ElementTree(root)
    # Look the file name up by its column header; indexing row.items()
    # would give a (key, value) pair whose position is not guaranteed.
    filename = row['File / Entity Name']
    for k, v in row.items():
        child = SubElement(root, k)
        child.text = v
    # One output file per row. A raw string cannot end in a backslash, so
    # the separator is added as a normal '\\' literal. Passing the path lets
    # ElementTree open and close the file itself.
    tree.write(r'U:\PROJECTS\Technical Graphics' + '\\' + filename + '.xml')
    print(tostring(root))
You only want to create the csv.DictReader() class once, rather than for each iteration of your loop.
Similarly, you only want to create your root XML element once.
Finally, the order of the items returned from row.items() is arbitrary, and not reflective of the order of the fields in the file.
Try this:
# CSV module
import csv
# Stuff from the XML module
from xml.etree.ElementTree import Element, SubElement, tostring, ElementTree
import xml.etree.ElementTree as etree
# Topmost XML element
root = Element('root')
number = Element('number')

# Open a file
with open(r'U:\PROJECTS\Technical Graphics\book1.csv') as f:
    # One root and one reader for the whole file.
    root = Element('gmd:MD_Metadata')
    tree = ElementTree(root)
    reader = csv.DictReader(f)
    for row in reader:
        xml_row = SubElement(root, "row")
        # reader.fieldnames lists the columns in header order.
        for k in reader.fieldnames:
            child = SubElement(xml_row, k)
            child.text = row[k]

# Passing the path (not an open file object) lets ElementTree open and
# close the output file itself.
tree.write(r'U:\PROJECTS\Technical Graphics\test.xml')
print(tostring(root))

variable in XML subelement

I'm looking for Python code to create nested ETREE SubElements dynamically.
I have a hierarchical header that describes a piece of a book, as follows:
<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text
</Booktype>
</Books>
How to get a hierarchical xml tag like this :
<Books>
<Booktype>
<Story>
<Fiction>
<Young>
#here the rest of book text
</Young>
</Fiction>
</Story>
</Booktype>
</Books>
This is my code:
import re
import xml.etree.ElementTree as ET
from xml.etree import ElementTree
# Split the hierarchical header into its individual level names.
List= "Story>Fiction>Young"
List = List.split('>')
root = ET.Element('Books')
Booktype =ET.SubElement(root,'Booktype')
for l in List:
    # SubElement() creates the element as a child of Booktype ...
    ND = ET.SubElement(Booktype,str(l))
    # ... and this append adds the same element to Booktype once more.
    Booktype.append(ND)
tree = ET.ElementTree(root)
# Bare expression: only shows a value when run in an interactive session.
ElementTree.tostring(root,'utf-8')
I got this bad result:
'<Books><Booktype><Story /><Story /><Story /><Fiction /><Fiction /><Young /><Young /><Story /><Story /><Fiction /><Fiction /><Young /><Young /></Booktype></Books>'
If you want to nest the list elements, you have to keep a reference to the previous one so that you can add the child element to it, and not to the Booktype element. See the variable current in the examples.
from xml.etree import ElementTree as ET
xml_string = '''<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text 2
</Booktype>
</Books>
'''

xml = ET.fromstring(xml_string)
for booktype in xml.findall('Booktype'):
    # "Story > Fiction > Young" -> ['Story', 'Fiction', 'Young']
    types = [name.strip() for name in booktype.get('List').split('>')]
    # `current` always points at the innermost element created so far, so
    # each new level is nested inside the previous one.
    current = booktype
    for t in types:
        current = ET.SubElement(current, t)
    # Move the book text down to the innermost level and drop the header.
    current.text = booktype.text
    booktype.text = ''
    del booktype.attrib['List']
# print(...) with a single argument works on both Python 2 and Python 3.
print(ET.tostring(xml, 'utf-8'))
Gives me the result:
<Books>
<Booktype><Story><Fiction><Young>
#here the rest of book text
</Young></Fiction></Story></Booktype>
<Booktype><Science><Math><Young>
#here the rest of book text 2
</Young></Math></Science></Booktype>
</Books>
And if you want to create a completely new structure you can do:
xml = ET.fromstring(xml_string)
# Build a brand-new <Books> tree instead of modifying the parsed one.
root = ET.Element('Books')
for booktype in xml.findall('Booktype'):
    current = ET.SubElement(root, 'Booktype')
    # Each level named in the List attribute becomes a child of the
    # previous level.
    for t in (name.strip() for name in booktype.get('List').split('>')):
        current = ET.SubElement(current, t)
    current.text = booktype.text
# print(...) with a single argument works on both Python 2 and Python 3.
print(ET.tostring(root, 'utf-8'))

ElementTree element index look up

I'm using the xml.etree.ElementTree module to create an XML document with Python 3.1 from another structured document.
What ElementTree function can I use that returns the index of an existing sub element?
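The getchildren method returns a list of sub-elements of an Element object. You could then use the built-in index method of a list. (Note: getchildren() was deprecated in Python 3.2 and removed in Python 3.9; list(element) returns the same list of direct children.)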
>>> import xml.etree.ElementTree as ET
>>> root = ET.Element("html")
>>> head = ET.SubElement(root, "head")
>>> body = ET.SubElement(root, "body")
>>> root.getchildren().index(body)
1
import xml.etree.ElementTree as ET
# Raw string: in a normal literal the "\U" in "\Users" starts a unicode
# escape, which is a syntax error on Python 3.
root = ET.Element(r'C:\Users\Administrator\Desktop\ValidationToolKit_15.9\ValidationToolKit_15.9\NE3S_VTK\webservice\history\ofas.2017-1-3.10-55-21-608.xml')
childnew = ET.SubElement(root, "354")
# Element.getchildren() was removed in Python 3.9; list(root) yields the
# same list of direct children and works on older versions as well.
list(root).index(childnew)  # -> 0
def alarms_validation(self, path, alarm_no, alarm_text):
    """Scan an XML file for an alarm number and check the text that follows it.

    For each child of the document root, the first six sub-elements are
    examined. When a sub-element's text equals ``alarm_no``, "found" is
    printed and the text of the next sub-element is compared with
    ``alarm_text``. The record index and field index are printed as
    progress output while scanning.

    Args:
        path: Path of the XML file to parse.
        alarm_no: Alarm number to look for (compared as text).
        alarm_text: Text expected in the sub-element right after the match.

    Returns:
        None. The result is reported through printed messages only.
    """
    # NOTE(review): `et` is not defined in this snippet; presumably an
    # ElementTree-compatible module imported elsewhere. Confirm.
    with open(path) as f:
        root = et.parse(f).getroot()

    # Iterate over the records that actually exist instead of counting up
    # to a fixed bound and relying on IndexError to stop. That way one
    # short record cannot silently end the whole scan.
    for x, record in enumerate(root):
        print(x)
        fields = list(record)
        for y, field in enumerate(fields[:6]):
            print(y)
            if field.text != alarm_no:
                continue
            print("found")
            if y + 1 >= len(fields):
                # No following sub-element to compare against.
                continue
            if fields[y + 1].text != alarm_text:
                print("Alarm text is not proper")
            else:
                print("Alarm Text is proper")

Categories

Resources