Reshape xml using python? - python

I have an XML document like this:
<data>
<B>Head1</B>
<I>Inter1</I>
<I>Inter2</I>
<I>Inter3</I>
<I>Inter4</I>
<I>Inter5</I>
<O>,</O>
<B>Head2</B>
<I>Inter6</I>
<I>Inter7</I>
<I>Inter8</I>
<I>Inter9</I>
<O>,</O>
<O> </O>
</data>
and I want the XML to look like
<data>
<combined>Head1 Inter1 Inter2 Inter3 Inter4 Inter5</combined>,
<combined>Head2 Inter6 Inter7 Inter8 Inter9</combined>
</data>
I tried to get all values of "B"
# Print the text of every <B> element that `mod` yields for tag 'B'.
# NOTE(review): getiterator() was removed from ElementTree in Python 3.9;
# iter() is the documented replacement.
for value in mod.getiterator(tag='B'):
print (value.text)
Head1
Head2
# Print the text of every <I> element that `mod` yields for tag 'I'.
# NOTE(review): getiterator() was removed from ElementTree in Python 3.9;
# iter() is the documented replacement.
for value in mod.getiterator(tag='I'):
print (value.text)
Inter1
Inter2
Inter3
Inter4
Inter5
Inter6
Inter7
Inter8
Inter9
Now, how should I save the values from the first iteration in one tag and those from the second in a different tag? I.e., how do I make the iteration start at a "B" tag, collect all the "I" tags that follow it, and then start again when I find the next "B" tag, saving each group in a new tag?
tag "O" will always be present at the end

You can use ElementTree module from xml.etree:
from xml.etree import ElementTree
struct = """
<data>
{}
</data>
"""
def reformat(tree):
    """Yield one ``<combined>`` element string per comma-terminated group.

    Walks the children of every ``<data>`` element reachable from the root of
    *tree* in document order and collects each child's text.  When a child
    whose text is exactly ``','`` is reached, the texts collected so far are
    joined with single spaces, wrapped in ``<combined>...</combined>`` and
    yielded, and collection starts over.

    Texts collected after the last ``','`` are never yielded.

    Args:
        tree: a parsed ``ElementTree`` (anything with ``getroot()``).

    Yields:
        str: ``"<combined>TEXT</combined>"`` for each group.
    """
    root = tree.getroot()
    seen = []
    for neighbor in root.iter('data'):
        # Iterate the element directly: Element.getchildren() was removed in
        # Python 3.9.
        for child in neighbor:
            tx = child.text
            if tx == ',':
                yield "<combined>{}</combined>".format(' '.join(seen))
                seen = []
            elif tx is not None:
                # Elements without text (e.g. <I/>) would make str.join fail.
                seen.append(tx)
# Parse the input file, then print the regrouped document: the strings
# produced by reformat() are joined with ",\n" and substituted into the
# <data> template.
with open('test.xml') as f:
    tree = ElementTree.parse(f)
print(struct.format(',\n'.join(reformat(tree))))
result:
<data>
<combined>Head1 Inter1 Inter2 Inter3 Inter4 Inter5</combined>,
<combined>Head2 Inter6 Inter7 Inter8 Inter9</combined>
</data>
Note that if you're not sure all the blocks are separated with a comma, you can simply change the condition if tx == ',': according to your file format. You can also check whether tx starts with 'Head': if so and seen is not empty, yield seen and clear its content (then start the new group with tx); otherwise append tx and continue.

Related

Read XML with Python tree.getroot

I am new to Python, I have this XML and this code. This is an invoice, where "SalesOrderRet" is the header and "SalesOrderLineRet" is each line of the invoice. The problem that I have is I don't know how to read the SalesOrderLineRet individually for each header. The code that I have here is adding me all the "SalesOrderLineRet" from the entire XML and not just one for the header.
# Reads 'LastResponse.xml' and, for every <SalesOrderRet> header, extracts the
# reference number, creation time and customer name, then looks up the
# description of the order lines.
def read_xml():
tree = ET.parse('LastResponse.xml')
root = tree.getroot()
form_data = {}
# `db` is not defined in this snippet; presumably a database handle created elsewhere.
collection = db["tracking"]
for item in root.iter('SalesOrderRet'):
WO = item.find('RefNumber').text
TimeCreatedQB = item.find('TimeCreated').text
Client = item.find('CustomerRef/FullName').text
# NOTE(review): this searches from `root`, not from `item`, so it visits every
# <SalesOrderLineRet> in the whole document for each header -- the behaviour
# the question describes.
for items in root.iter('SalesOrderLineRet'):
# getattr(..., 'text', None) yields None when the line has no <Desc> child.
descrip = getattr(items.find('Desc'), 'text', None)
For an XML file like this,
<?xml version="1.0"?>
<data>
<SalesOrderRet>
<SalesOrderLineRet>
<RefNumber>1</RefNumber>
<TimeCreated>0:00</TimeCreated>
<CustomerRef>
<FullName>John Doe</FullName>
</CustomerRef>
</SalesOrderLineRet>
<SalesOrderLineRet>
<RefNumber>2</RefNumber>
<TimeCreated>0:00</TimeCreated>
<CustomerRef>
<FullName>Jack Doe</FullName>
</CustomerRef>
</SalesOrderLineRet>
</SalesOrderRet>
<SalesOrderRet>
<SalesOrderLineRet>
<RefNumber>3</RefNumber>
<TimeCreated>0:00</TimeCreated>
<CustomerRef>
<FullName>Mary Doe</FullName>
</CustomerRef>
</SalesOrderLineRet>
<SalesOrderLineRet>
<RefNumber>4</RefNumber>
<TimeCreated>0:00</TimeCreated>
<CustomerRef>
<FullName>Susan Doe</FullName>
</CustomerRef>
</SalesOrderLineRet>
</SalesOrderRet>
</data>
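The function below reads the order lines of each header separately: it searches for <SalesOrderLineRet> elements inside the current <SalesOrderRet> rather than from the root. If you need to keep the values, index each <SalesOrderRet> tag and store the individual values under that index.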
def get_xml(filename):
    """Print every top-level element of *filename* with its own order lines.

    For each direct child of the root, the tag and attributes are printed,
    followed by one entry per ``SalesOrderLineRet`` descendant of that child:
    its tag and attributes, then its ``RefNumber``, ``TimeCreated`` and
    ``CustomerRef/FullName`` texts.  A missing sub-element is printed as
    ``None`` instead of raising ``AttributeError``.

    Args:
        filename: path (or file object) accepted by ``ET.parse``.
    """
    tree = ET.parse(filename)
    root = tree.getroot()
    for order in root:
        print(order.tag, order.attrib)
        # Search from the current order, not from the root, so that only
        # this order's lines are visited.
        for line in order.iter('SalesOrderLineRet'):
            print(' ', line.tag, line.attrib)
            wo = getattr(line.find('RefNumber'), 'text', None)
            time_created_qb = getattr(line.find('TimeCreated'), 'text', None)
            client = getattr(line.find('CustomerRef/FullName'), 'text', None)
            print(' ', wo, time_created_qb, client)
This code is based off of the docs

Python parse and modify XML elements and subelements

I'm using ElementTree to parse and modify my XML-File with the structure below. The actual file is much bigger Platz_1 to Platz_250 but the structure is the same. Now I want to set all elements.text and subelements.text of Platz_X to "0" at once, when the element.text of "_Name" of Platz_X is None and continue with the next Platz_X+1
My problem is when i parse through the file in a loop to check all the values, I don't know how to stop my loop set all the texts to "0" and continue with the next Platz_X+1.
# Walk root -> sub_wkz -> platz -> child -> subchild and remember the text of
# every element whose text is not just whitespace.
tree = ET.parse(xml)
root = tree.getroot()
# NOTE(review): Element.getchildren() was removed in Python 3.9; list(root)
# is the documented replacement.
wkz = root.getchildren()
for sub_wkz in wkz:
for platz in sub_wkz:
for child in platz:
if child.text:
# split() with no arguments returns [] for whitespace-only text.
if len(child.text.split()) > 0:
var = child.text
for subchild in child:
if subchild.text:
# NOTE(review): this tests child.text again; subchild.text was presumably intended.
if len(child.text.split()) > 0:
var_sub = subchild.text
<?xml version='1.0' encoding='utf-8'?>
<Maschine>
<INDUSTRIE_WKZ_1>
<Platz_1>
<_Name>6006003</_Name>
<_Duplo>1</_Duplo>
<_Zustand>131</_Zustand>
<Schneide_1>
<_Sollstandzeit>60,0</_Sollstandzeit>
<_Iststandzeit>50,58213424682617</_Iststandzeit>
<_Vorwarngrenze>10,0</_Vorwarngrenze>
<_Laenge_L1>237,89599609375</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_1>
<Schneide_2>
<_Sollstandzeit>0</_Sollstandzeit>
<_Iststandzeit>0</_Iststandzeit>
<_Vorwarngrenze>0</_Vorwarngrenze>
<_Laenge_L1>0</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_2>
<Schneide_3>
<_Sollstandzeit>0</_Sollstandzeit>
<_Iststandzeit>0</_Iststandzeit>
<_Vorwarngrenze>0</_Vorwarngrenze>
<_Laenge_L1>0</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_3>
<Schneide_4>
<_Sollstandzeit>0</_Sollstandzeit>
<_Iststandzeit>0</_Iststandzeit>
<_Vorwarngrenze>0</_Vorwarngrenze>
<_Laenge_L1>0</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_4>
</Platz_1>
<INDUSTRIE_WKZ_1>
<Maschine>
I changed the XML you provided a bit :
added the missing slash (/) to the INDUSTRIE_WKZ_1 closing tag
added the missing slash (/) to the <Maschine>closing tag
removed the Schneide_2 through 4 for brevity (but it works fine with it)
added a Platz_2 whose _Name is empty (if that is what you mean by "is None") in an INDUSTRIE_WKZ_2 (so the code works if there are multiple "WKZ")
This is the input file I used :
<?xml version='1.0' encoding='utf-8'?>
<Maschine>
<INDUSTRIE_WKZ_1>
<Platz_1>
<_Name>6006003</_Name>
<_Duplo>1</_Duplo>
<_Zustand>131</_Zustand>
<Schneide_1>
<_Sollstandzeit>60,0</_Sollstandzeit>
<_Iststandzeit>50,58213424682617</_Iststandzeit>
<_Vorwarngrenze>10,0</_Vorwarngrenze>
<_Laenge_L1>237,89599609375</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_1>
</Platz_1>
</INDUSTRIE_WKZ_1>
<INDUSTRIE_WKZ_2>
<Platz_2>
<_Name></_Name>
<_Duplo>1</_Duplo>
<_Zustand>131</_Zustand>
<Schneide_1>
<_Sollstandzeit>60,0</_Sollstandzeit>
<_Iststandzeit>50,58213424682617</_Iststandzeit>
<_Vorwarngrenze>10,0</_Vorwarngrenze>
<_Laenge_L1>237,89599609375</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_1>
</Platz_2>
</INDUSTRIE_WKZ_2>
</Maschine>
I assume there is only one Maschine and that it only contains INDUSTRIE_WKZ_* which contains Platz_*.
And here is my code :
from itertools import islice
from xml.etree.ElementTree import ElementTree as ET
src_xmlfile_name = "68253543.xml"
dst_xmlfile_name = "68253543_post.xml"

# Bind the tree instance to its own name; rebinding ``ET`` would hide the
# imported ElementTree class.
tree = ET()
root = tree.parse(src_xmlfile_name)  # ElementTree.parse() returns the root element

for platz_elem in root.findall("*/*"):  # all "Platz" children of "WKZ" children of the root
    platz_name_elem = platz_elem.find("_Name")
    if platz_name_elem.text is None:
        # we want to put to 0 all values in this Platz's descendants
        for platz_descendant in islice(platz_elem.iter(), 1, None):  # skip the first one, which is the "Platz" elem
            if (platz_descendant.tag != "_Name"  # keep "_Name" untouched
                    and platz_descendant.text is not None  # keep empty ones
                    and platz_descendant.text.strip() != ""):  # keep whitespace-only ones (e.g. "Schneide_*" containers)
                platz_descendant.text = "0"

tree.write(dst_xmlfile_name, encoding="utf-8", xml_declaration=True)
which produces this output :
<?xml version='1.0' encoding='utf-8'?>
<Maschine>
<INDUSTRIE_WKZ_1>
<Platz_1>
<_Name>6006003</_Name>
<_Duplo>1</_Duplo>
<_Zustand>131</_Zustand>
<Schneide_1>
<_Sollstandzeit>60,0</_Sollstandzeit>
<_Iststandzeit>50,58213424682617</_Iststandzeit>
<_Vorwarngrenze>10,0</_Vorwarngrenze>
<_Laenge_L1>237,89599609375</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_1>
</Platz_1>
</INDUSTRIE_WKZ_1>
<INDUSTRIE_WKZ_2>
<Platz_2>
<_Name />
<_Duplo>0</_Duplo>
<_Zustand>0</_Zustand>
<Schneide_1>
<_Sollstandzeit>0</_Sollstandzeit>
<_Iststandzeit>0</_Iststandzeit>
<_Vorwarngrenze>0</_Vorwarngrenze>
<_Laenge_L1>0</_Laenge_L1>
<_Laenge_L2>0</_Laenge_L2>
<_Radius>0</_Radius>
</Schneide_1>
</Platz_2>
</INDUSTRIE_WKZ_2>
</Maschine>
(including the XML declaration in the output file is based on this answer)

Adding a parent tag to a nested structure with ElementTree (Python)

I have the following structure
<root>
<data>
<config>
CONFIGURATION
</config>
</data>
</root>
With Python's ElementTree module I want to add a parent element to <config> tag as
<root>
<data>
<type>
<config>
CONFIGURATION
</config>
</type>
</data>
</root>
Also the xml file might have other config tags elsewhere but I'm only interested in the ones appearing under data tag.
This boils down to ~3 steps:
get the elements that match your criteria (tag == x, parent tag == y)
remove that element from the parent, putting a new child in that place
add the former child to the new child.
For the first step, we can use this answer. Since we know we'll need the parent later, let's keep that too in our search.
def find_elements(tree, child_tag, parent_tag):
    """Yield ``(element, parent)`` for elements tagged *child_tag* whose
    direct parent is tagged *parent_tag*.

    ElementTree elements hold no reference to their parent, so a
    child -> parent map is built from a full traversal first.

    Args:
        tree: an ``ElementTree`` or ``Element`` (anything providing ``iter()``).
        child_tag: tag of the elements to look for.
        parent_tag: tag the direct parent must have.

    Yields:
        tuple: ``(element, parent)`` in document order.
    """
    parent_map = {child: parent for parent in tree.iter() for child in parent}
    for el in tree.iter(child_tag):
        # The root element has no parent and is therefore not in the map.
        parent = parent_map.get(el)
        if parent is not None and parent.tag == parent_tag:
            yield el, parent
steps two and three are pretty related, we can do them together.
def insert_new_els(tree, child_tag, parent_tag, new_node_tag):
    """Wrap every *child_tag* element found directly under a *parent_tag*
    element in a new *new_node_tag* element.

    The tree is modified in place; nothing is returned.

    Args:
        tree: an ``ElementTree`` or ``Element`` to modify.
        child_tag: tag of the elements to wrap.
        parent_tag: tag the direct parent of those elements must have.
        new_node_tag: tag of the wrapper element to create.
    """
    # Collect the matches first: the tree is changed inside the loop below.
    to_replace = list(find_elements(tree, child_tag, parent_tag))
    for child, parent in to_replace:
        ix = list(parent).index(child)
        new_node = ET.Element(new_node_tag)
        # Text following the child's end tag belongs to the parent's content,
        # so it has to follow the wrapper rather than move inside it.
        new_node.tail = child.tail
        child.tail = None
        parent.insert(ix, new_node)
        parent.remove(child)
        new_node.append(child)
Your tree will be modified in place.
Now usage is simply:
# Parse the file, wrap each <config> that sits directly under a <data>
# element in a new <type> element, and write the result to a new file.
tree = ET.parse('some_file.xml')
insert_new_els(tree, 'config', 'data', 'type')
tree.write('some_file_processed.xml')
untested

Removing parent element and all subelements from XML

Given an XML file with the following structure:
<Root>
<Stuff></Stuff>
<MoreStuff></MoreStuff>
<Targets>
<Target>
<ID>12345</ID>
<Type>Ground</Type>
<Size>Large</Size>
</Target>
<Target>
...
</Target>
</Targets>
</Root>
I'm trying to loop through each child under the <Targets> element, check each <ID> for a specific value, and if the value is found, then I want to delete the entire <Target> entry. I've been using the ElementTree Python library with little success. Here's what I have so far:
# Attempt from the question: look at every <Target> and react when its <ID>
# text contains '12345'.
import xml.etree.ElementTree as ET
tree = ET.parse('file.xml')
root = tree.getroot()
# NOTE(review): getiterator() was removed in Python 3.9; iter() replaces it.
iterator = root.getiterator('Target')
for item in iterator:
old = item.find('ID')
text = old.text
if '12345' in text:
# Removes only the <ID> child from its <Target>; the <Target> itself stays.
item.remove(old)
tree.write('out.xml')
The problem I'm having with this approach is that only the <ID> sub element is removed, however I need the entire <Target> element and all of its child elements removed. Can anyone help! Thanks.
Unfortunately, element tree elements don't know who their parents are. There is a workaround -- You can build the mapping yourself:
tree = ET.parse('file.xml')
root = tree.getroot()
# Elements do not know their parent, so build a child -> parent lookup.
# (iter() replaces getiterator(), which was removed in Python 3.9.)
parent_map = {c: p for p in root.iter() for c in p}
# list so that we don't mess up the order of iteration when removing items.
for item in list(root.iter('Target')):
    old = item.find('ID')
    # A <Target> without an <ID>, or with an empty one, can never match.
    if old is not None and old.text and '12345' in old.text:
        parent_map[item].remove(item)
tree.write('out.xml')
Untested
You need to keep a reference to the Targets element so that you can remove its children, so start your iteration from there. Grab each Target, check your condition and remove what you don't like.
#!/usr/bin/env python
import xml.etree.ElementTree as ET

xmlstr = """<Root>
<Stuff></Stuff>
<MoreStuff></MoreStuff>
<Targets>
<Target>
<ID>12345</ID>
<Type>Ground</Type>
<Size>Large</Size>
</Target>
<Target>
...
</Target>
</Targets>
</Root>"""

root = ET.fromstring(xmlstr)
# Keep a handle on the parent element: children are removed through it.
targets = root.find('Targets')
# findall() returns a list, so removing from `targets` while looping is safe.
for target in targets.findall('Target'):
    _id = target.find('ID')
    # Skip targets that have no <ID> element or an empty one.
    if _id is not None and _id.text and '12345' in _id.text:
        targets.remove(target)
# encoding="unicode" makes tostring() return str instead of bytes.
print(ET.tostring(root, encoding="unicode"))

Parsing XML with ElementTree in Python

I have XML like this:
<parameter>
<name>ec_num</name>
<value>none</value>
<units/>
<url/>
<id>2455</id>
<m_date>2008-11-29 13:15:14</m_date>
<user_id>24</user_id>
<user_name>registry</user_name>
</parameter>
<parameter>
<name>swisspro</name>
<value>Q8H6N2</value>
<units/>
I want to parse the XML and extract the <value> entry which is just below the <name> entry marked 'swisspro'. I.e. I want to parse and extract the 'Q8H6N2' value.
How would I do this using ElementTree?
It would be much easier to do via lxml, but here's a solution using the ElementTree library:
import xml.etree.ElementTree as ET

data = """<parameters>
<parameter>
<name>ec_num</name>
<value>none</value>
<units/>
<url/>
<id>2455</id>
<m_date>2008-11-29 13:15:14</m_date>
<user_id>24</user_id>
<user_name>registry</user_name>
</parameter>
<parameter>
<name>swisspro</name>
<value>Q8H6N2</value>
<units/>
</parameter>
</parameters>"""

tree = ET.fromstring(data)
# Look at each <parameter>; stop at the first one whose <name> is 'swisspro'
# and print the text of its <value>.
for parameter in tree.iter(tag='parameter'):
    name = parameter.find('name')
    if name is not None and name.text == 'swisspro':
        print(parameter.find('value').text)
        break
prints:
Q8H6N2
The idea is pretty simple: iterate over all parameter tags, check the value of the name tag and if it is equal to swisspro, get the value element.
Hope that helps.
Here is an example:
xml file
<?xml version="1.0" encoding="utf-8"?>
<root>
<person age="18">
<name>hzj</name>
<sex>man</sex>
</person>
<person age="19" des="hello">
<name>kiki</name>
<sex>female</sex>
</person>
</root>
parse method
from xml.etree import ElementTree


def print_node(node):
    """Print the attributes, tag and text of *node*.

    The ``age`` attribute is printed on a line of its own when present.
    """
    print("==============================================")
    print("node.attrib:%s" % node.attrib)
    # dict.has_key() does not exist in Python 3; use the ``in`` operator.
    if "age" in node.attrib:
        print("node.attrib['age']:%s" % node.attrib['age'])
    print("node.tag:%s" % node.tag)
    print("node.text:%s" % node.text)


def read_xml(text):
    """Parse *text* as XML and show four ways of reaching elements.

    Args:
        text: the XML document as ``str`` or ``bytes``; it must contain at
            least two ``person`` elements with a ``name`` child each, as the
            demonstration indexes into the results.
    """
    # root = ElementTree.parse(r"D:/test.xml")  # first method
    root = ElementTree.fromstring(text)  # second method

    # get element
    # 1. by iter() (getiterator() was removed in Python 3.9); materialise the
    #    generator because the result is indexed below.
    lst_node = list(root.iter("person"))
    for node in lst_node:
        print_node(node)

    # 2. by direct children (getchildren() was removed in Python 3.9)
    lst_node_child = list(lst_node[0])[0]
    print_node(lst_node_child)

    # 3. by .find
    node_find = root.find('person')
    print_node(node_find)

    # 4. by findall
    node_findall = root.findall("person/name")[1]
    print_node(node_findall)


if __name__ == '__main__':
    # Read bytes so the parser honours the encoding named in the XML
    # declaration, and close the file deterministically.
    with open("test.xml", "rb") as xml_file:
        read_xml(xml_file.read())

Categories

Resources