Parse xml with python getting x,y values

Parse xml with python getting x,y values - python

i need to get x,y localisation from xml file
-<TwoDimensionSpatialCoordinate>
<coordinateIndex value="0"/>
<x value="302.6215607602997"/>
<y value="166.6285651861381"/>
</TwoDimensionSpatialCoordinate>
from xml.dom import minidom
doc = minidom.parse("1.631791322.58809740.14.834982.40440.3641459051.955.6373933.1920.xml")
"""doc.getElementsByTagName returns NodeList
coordinate = doc.getElementsByTagName("coordinateIndex")[0]
print(coordinate.firstChild.data)
"""
coordinate = doc.getElementsByTagName("coordinateIndex")
for coordinateIndex in coordinate:
value = coordinateIndex.getAttribute("value")
coordinatex = doc.getElementsByTagName("x")
for x in coordinatex:
valuex = x.getAttribute("value")
coordinatey = doc.getElementsByTagName("y")
for y in coordinatey:
valuey = y.getAttribute("value")
print("value:%s, x:%s,y:%s" % (value, x , y))
so when i execute i get this result
value:22, x:,y:
can Anyone help me please ?:(

As your example xml file
<?xml version="1.0" ?>
<TwoDimensionSpatialCoordinate>
<coordinateIndex value="0"/>
<x value="302.6215607602997"/>
<y value="166.6285651861381"/>
<coordinateIndex value="1"/>
<x value="3.6215607602997"/>
<y value="1.6285651861381"/>
</TwoDimensionSpatialCoordinate>
import xml.dom.minidom
def main(file):
doc = xml.dom.minidom.parse(file)
values = doc.getElementsByTagName("coordinateIndex")
coordX = doc.getElementsByTagName("x")
coordY = doc.getElementsByTagName("y")
d = {}
for atr_value, atr_x, atr_y in zip(values, coordX, coordY):
value = atr_value.getAttribute('value')
x = atr_x.getAttribute('value')
y = atr_y.getAttribute('value')
d[value] = [x, y]
return d
result = main('/path/file.xml')
print(result)
# {'0': ['302.621', '166.628'], '1': ['3.621', '1.628']}

Using the ElementTree API (use ET.parse(filename).getroot() instead of ET.XML() to load from a file):
from xml.etree import ElementTree as ET
xml = ET.XML("""
<?xml version="1.0" ?>
<Things>
<TwoDimensionSpatialCoordinate>
<coordinateIndex value="0"/>
<x value="302.6215607602997"/>
<y value="166.6285651861381"/>
</TwoDimensionSpatialCoordinate>
<TwoDimensionSpatialCoordinate>
<coordinateIndex value="1"/>
<x value="3.6215607602997"/>
<y value="1.6285651861381"/>
</TwoDimensionSpatialCoordinate>
</Things>
""".strip())
coords_by_index = {}
for coord in xml.findall(".//TwoDimensionSpatialCoordinate"):
coords_by_index[coord.find("coordinateIndex").get("value")] = (
coord.find("x").get("value"),
coord.find("y").get("value"),
)
print(coords_by_index)
outputs
{
'0': ('302.6215607602997', '166.6285651861381'),
'1': ('3.6215607602997', '1.6285651861381'),
}

Related

Txt to XML Python parser

I tried to parse a .txt file that looks like this:
-------------------------------------------------------------------------------
Compare Results
Compare Directory 1 : /data/Run_288/bitmaps
Compare Directory 2 : /data/Run_301/bitmaps
-------------------------------------------------------------------------------
idx, Filename , Exact, F3x3, F5x5, F7x7, Threshold, P/F
-------------------------------------------------------------------------------
1, ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00001.tif, 0, 0, 0, 0, 0, PASS
2, ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00002.tif, 0, 0, 0, 0, 0, PASS
-------------------------------------------------------------------------------
Bitmap Compare FAILURE !!! Threshold Exceeded : Threshold Values : Exact = 0 : Fuzzy 3x3 = 200 : Fuzzy 5x5 = 100 : Fuzzy 7x7 = 50 : Threshold 7x7 = 0
3, MIME_Test3_Job_setup__600X600_50_default_default_PPST56_003.mjm_20190521213826_00001.tif, 2083, 1180, 650, 262, 52, FAIL
-------------------------------------------------------------------------------
I need to obtain an xml with this format:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<Suite date="2019-05-27T10:47:03" compareDir1="/data/Run_288/bitmaps" compareDir2="/data/Run_301/bitmaps">
<Test name="ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00001.tif" result="pass">
</Test>
<Test name="ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00002.tif" result="pass">
</Test>
<Test name="MIME_Test3_Job_setup__600X600_50_default_default_PPST56_003.mjm_20190521213826_00001.tif" result="crash">
</Test>
</Suite>
This is the code that should do the work.The problem is that is not working and with my little python knowledge I don't know why.Can somebody help me please with this?!
Thank you!
import xml.etree.ElementTree as ET
root = ET.Element('Suite')
with open('file3.txt') as f:
lines = f.read().splitlines()
print(lines)
#add first subelement
celldata = ET.SubElement(root, 'Test')
import itertools as it
#for every line in input file
#group consecutive dedup to one
for line in it.groupby(lines):
line=line[0]
#if its a break of subelements - that is an empty space
if not line:
#add the next subelement and get it as celldata
celldata = ET.SubElement(root, 'test')
else:
#otherwise, split with : to get the tag name
tag = line.split(",")
#format tag name
el=ET.SubElement(celldata,tag[1])
print(tag[1])
print(tag[7])
tag=' '.join(tag[1]).strip()
if 'PASS' in line:
tag = line.split(",")[-1].strip()
elif 'FAILURE' in line:
splist = filter(None,line.split(" "))
tag = splist[splist.index(',')+1]
el.text = tag
#prettify xml
import xml.dom.minidom as minidom
formatedXML = minidom.parseString(
ET.tostring(
root)).toprettyxml(indent=" ",encoding='utf-8').strip()
# Display for debugging
print formatedXML
#write the formatedXML to file.
with open("results.xml","w+") as f:
f.write(formatedXML)

For this I would use regular expressions. My take:
data = '''-------------------------------------------------------------------------------
Compare Results
Compare Directory 1 : /data/Run_288/bitmaps
Compare Directory 2 : /data/Run_301/bitmaps
-------------------------------------------------------------------------------
idx, Filename , Exact, F3x3, F5x5, F7x7, Threshold, P/F
-------------------------------------------------------------------------------
1, ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00001.tif, 0, 0, 0, 0, 0, PASS
2, ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00002.tif, 0, 0, 0, 0, 0, PASS
-------------------------------------------------------------------------------
Bitmap Compare FAILURE !!! Threshold Exceeded : Threshold Values : Exact = 0 : Fuzzy 3x3 = 200 : Fuzzy 5x5 = 100 : Fuzzy 7x7 = 50 : Threshold 7x7 = 0
3, MIME_Test3_Job_setup__600X600_50_default_default_PPST56_003.mjm_20190521213826_00001.tif, 2083, 1180, 650, 262, 52, FAIL
-------------------------------------------------------------------------------'''
import re
dirs = []
for d in re.findall('Compare Directory\s+(\d+)\s*:\s*(.*?)$', data, flags=re.DOTALL|re.MULTILINE):
dirs += [d]
passes = []
fails = []
for line in data.split('\n'):
for p in re.findall('(\d+,\s+(.*?),.*?PASS)$', line):
passes += [p]
for f in re.findall('(\d+,\s+(.*?),.*?FAIL)$', line):
fails += [f]
s = f'''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<Suite date="2019-05-27T10:47:03" compareDir1="{dirs[0][1]}" compareDir2="{dirs[1][1]}">
'''
for p in passes:
s += f''' <Test name="{p[1]}" result="pass">
</Test>
'''
for fail in fails:
s += f''' <Test name="{fail[1]}" result="crash">
</Test>
'''
s += '''</Suite>'''
print(s)
Prints:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<Suite date="2019-05-27T10:47:03" compareDir1="/data/Run_288/bitmaps" compareDir2="/data/Run_301/bitmaps">
<Test name="ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00001.tif" result="pass">
</Test>
<Test name="ASCII_APPE_600X450_150_colorManBasic2.blackGrayReproductionImage_0_2p_color_test_four_object.pdf_20190522005734_00002.tif" result="pass">
</Test>
<Test name="MIME_Test3_Job_setup__600X600_50_default_default_PPST56_003.mjm_20190521213826_00001.tif" result="crash">
</Test>
</Suite>

Merge output loop lxml

I want to take out some element from xml which look up from variable.
here is my.xml file:
<?xml version='1.0' encoding='UTF-8'?>
<ArrayOfSalesOrderHeader xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<SalesOrderHeader>
<TenantCode>15152343</TenantCode>
<SalesOrderDetails>
<SalesOrderDetail>
<ItemCode>20072129</ItemCode>
</SalesOrderDetail>
<SalesOrderDetail>
<ItemCode>67332054</ItemCode>
</SalesOrderDetail>
<SalesOrderDetail>
<ItemCode>20206133</ItemCode>
</SalesOrderDetail>
<SalesOrderDetail>
<ItemCode>62071796</ItemCode>
</SalesOrderDetail>
</SalesOrderDetails>
</SalesOrderHeader>
</ArrayOfSalesOrderHeader>
this is my script:
doc = ET.parse("my.xml")
arrDat = '20206133'
fol = doc.xpath('.//SalesOrderDetail[descendant::ItemCode[not(contains(text(),"' + arrDat + '"))]]')
for SOD in fol :
SOD.getparent().remove(SOD)
doc.write('output.xml', xml_declaration=True, encoding='utf-8', method="xml")
The problem when i defined arrDat as array:
doc = ET.parse("my.xml")
arrDat = ['20072129','67332054']
cnt = 0
while cnt < len(arrDat) :
fol = doc.xpath('.//SalesOrderDetail[descendant::ItemCode[not(contains(text(),"' + arrDat[cnt] + '"))]]')
for SOD in fol :
SOD.getparent().remove(SOD)
doc.write('output.xml', xml_declaration=True, encoding='utf-8', method="xml")
cnt += 1
i need output.xml to be like:
<?xml version='1.0' encoding='UTF-8'?>
<ArrayOfSalesOrderHeader xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<SalesOrderHeader>
<TenantCode>15152343</TenantCode>
<SalesOrderDetails>
<SalesOrderDetail>
<ItemCode>20072129</ItemCode>
</SalesOrderDetail>
<SalesOrderDetail>
<ItemCode>67332054</ItemCode>
</SalesOrderDetail>
</SalesOrderDetails>
</SalesOrderHeader>
</ArrayOfSalesOrderHeader>

I think you can simply check the item node value and remove the one not present on your list. Here is the implementation:
from lxml import etree as ET
doc = ET.parse("data1.xml")
arrDat = ['20072129', '67332054']
for order in doc.xpath("//SalesOrderDetail"):
item = order.xpath('ItemCode')
item_code = item[0].text
if item_code not in arrDat:
order.getparent().remove(order)
doc.write('output.xml', xml_declaration=True, encoding='utf-8', method="xml")

Create a dataframe from nested xml and generate a csv

I have an XML file like this:
<?xml version="1.0"?>
<PropertySet>
<PropertySet NumOutputObjects="1" >
<Message IntObjectName="Class Def" MessageType="Integration Object">
<ListOf_Class_Def>
<ImpExp Type="CLASS_DEF" Name="lp_pkg_cla" Object_Num="1001p">
<ListOfObject_Def>
<Object_Def Ancestor_Num="" Ancestor_Name="">
</Object_Def>
</ListOfObject_Def>
<ListOfObject_Arrt>
<Object_Arrt Orig_Id="6666p" Attr_Name="LP_Portable">
</Object_Arrt>
</ListOfObject_Arrt>
</ImpExp>
</ListOf_Class_Def>
</Message>
</PropertySet>
<PropertySet NumOutputObjects="1" >
<Message IntObjectName="Class Def" MessageType="Integration Object">
<ListOf_Class_Def>
<ImpExp Type="CLASS_DEF" Name="M_pkg_cla" Object_Num="1023i">
<ListOfObject_Def>
<Object_Def Ancestor_Num="" Ancestor_Name="">
</Object_Def>
</ListOfObject_Def>
<ListOfObject_Arrt>
<Object_Arrt Orig_Id="7010p" Attr_Name="O_Portable">
</Object_Arrt>
<Object_Arrt Orig_Id="7012j" Attr_Name="O_wireless">
</Object_Arrt>
</ListOfObject_Arrt>
</ImpExp>
</ListOf_Class_Def>
</Message>
</PropertySet>
<PropertySet NumOutputObjects="1" >
<Message IntObjectName="Prod Def" MessageType="Integration Object">
<ListOf_Prod_Def>
<ImpExp Type="PROD_DEF" Name="Laptop" Object_Num="2008a">
<ListOfObject_Def>
<Object_Def Ancestor_Num="1001p" Ancestor_Name="lp_pkg_cla">
</Object_Def>
</ListOfObject_Def>
<ListOfObject_Arrt>
</ListOfObject_Arrt>
</ImpExp>
</ListOf_Prod_Def>
</Message>
</PropertySet>
<PropertySet NumOutputObjects="1" >
<Message IntObjectName="Prod Def" MessageType="Integration Object">
<ListOf_Prod_Def>
<ImpExp Type="PROD_DEF" Name="Mouse" Object_Num="2987d">
<ListOfObject_Def>
<Object_Def Ancestor_Num="1023i" Ancestor_Name="M_pkg_cla">
</Object_Def>
</ListOfObject_Def>
<ListOfObject_Arrt>
</ListOfObject_Arrt>
</ImpExp>
</ListOf_Prod_Def>
</Message>
</PropertySet>
<PropertySet NumOutputObjects="1" >
<Message IntObjectName="Prod Def" MessageType="Integration Object">
<ListOf_Prod_Def>
<ImpExp Type="PROD_DEF" Name="Speaker" Object_Num="5463g">
<ListOfObject_Def>
<Object_Def Ancestor_Num="" Ancestor_Name="">
</Object_Def>
</ListOfObject_Def>
<ListOfObject_Arrt>
</ListOfObject_Arrt>
</ImpExp>
</ListOf_Prod_Def>
</Message>
</PropertySet>
</PropertySet>
I am hoping to extract the Name, Object_Num, Orig_Id and Attr_Name tags from it using Python and convert them into a .csv format.
The .csv format I'd like to see it in is simply:
ProductId Product AttributeId Attribute
2008a Laptop 6666p LP_Portable
2987d Mouse 7010p O_Portable
2987d Mouse 7012p O_Wireless
5463g Speaker "" ""
Actually there is a relationship like this in xml tags:
All products are in the tags, "ImpExp Type="PROD_DEF".. "
All attributes are in the tags, "ImpExp Type="CLASS_DEF".. "
If a product has attributes, then there is a tag
<Object_Def Ancestor_Num="1023i".. >
The Ancestor_Num is equal to Object_Num in tags,
Type="CLASS_DEF"..
I have tried this:
from lxml import etree
import pandas
import HTMLParser
inFile = "./newm.xml"
outFile = "./new.csv"
ctx1 = etree.iterparse(inFile, tag=("ImpExp", "ListOfObject_Def", "ListOfObject_Arrt",))
hp = HTMLParser.HTMLParser()
csvData = []
csvData1 = []
csvData2 = []
csvData3 = []
csvData4 = []
csvData5 = []
for event, elem in ctx1:
value1 = elem.get("Type")
value2 = elem.get("Name")
value3 = elem.get("Object_Num")
value4 = elem.get("Ancestor_Num")
value5 = elem.get("Orig_Id")
value6 = elem.get("Attr_Name")
if value1 == "PROD_DEF":
csvData.append(value2)
csvData1.append(value3)
for event, elem in ctx1:
if value4 is not None:
csvData2.append(value4)
elem.clear()
df = pandas.DataFrame({'Product':csvData, 'ProductId':csvData1, 'AncestorId':csvData2})
for event, elem in ctx1:
if value1 == "Class Def":
csvData3.append(value3)
csvData4.append(value5)
csvData5.append(value6)
elem.clear()
df1 = pandas.DataFrame({'AncestorId':csvData3, 'AttribId':csvData4, 'AttribName':csvData5})
dff = pandas.merge(df, df1, on="AncestorId")
dff.to_csv(outFile, index = False)

Consider XSLT, the special purpose language designed to transform XML files and can directly convert XML to CSV (i.e., text file) without the pandas dataframe intermediary. Python's third-party module lxml (which you are already using) can run XSLT 1.0 scripts and do so without for loops or if logic. However, due to the complex alignment of product and attributes, some longer XPath searches are used with XSLT.
XSLT (save as .xsl file, a special .xml file)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output indent="no" method="text"/>
<xsl:strip-space elements="*"/>
<xsl:param name="delimiter">,</xsl:param>
<xsl:template match="/PropertySet">
<xsl:text>ProductId,Product,AttributeId,Attribute
</xsl:text>
<xsl:apply-templates select="*"/>
</xsl:template>
<xsl:template match="PropertySet|Message|ListOf_Class_Def|ListOf_Prod_Def|ImpExp">
<xsl:apply-templates select="*"/>
</xsl:template>
<xsl:template match="ListOfObject_Arrt">
<xsl:apply-templates select="Object_Arrt"/>
<xsl:if test="name(*) != 'Object_Arrt' and preceding-sibling::ListOfObject_Def/Object_Def/#Ancestor_Name = ''">
<xsl:value-of select="concat(ancestor::ImpExp/#Name, $delimiter,
ancestor::ImpExp/#Object_Num, $delimiter,
'', $delimiter,
'')"/><xsl:text>
</xsl:text>
</xsl:if>
</xsl:template>
<xsl:template match="Object_Arrt">
<xsl:variable name="attrName" select="ancestor::ImpExp/#Name"/>
<xsl:value-of select="concat(/PropertySet/PropertySet/Message[#IntObjectName='Prod Def']/ListOf_Prod_Def/
ImpExp[ListOfObject_Def/Object_Def/#Ancestor_Name = $attrName]/#Name, $delimiter,
/PropertySet/PropertySet/Message[#IntObjectName='Prod Def']/ListOf_Prod_Def/
ImpExp[ListOfObject_Def/Object_Def/#Ancestor_Name = $attrName]/#Object_Num, $delimiter,
#Orig_Id, $delimiter,
#Attr_Name)"/><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
Python
import lxml.etree as et
# LOAD XML AND XSL
xml = et.parse('Input.xml')
xsl = et.parse('XSLT_Script.xsl')
# RUN TRANSFORMATION
transform = et.XSLT(xsl)
result = transform(xml)
# OUTPUT TO FILE
with open('Output.csv', 'wb') as f:
f.write(result)
Output
ProductId,Product,AttributeId,Attribute
Laptop,2008a,6666p,LP_Portable
Mouse,2987d,7010p,O_Portable
Mouse,2987d,7012j,O_wireless
Speaker,5463g,,

You would need to preparse all of the CLASS_DEF entries into a dictionary. These can then be looked up when processing the PROD_DEF entries:
import csv
from lxml import etree
inFile = "./newm.xml"
outFile = "./new.csv"
tree = etree.parse(inFile)
class_defs = {}
# First extract all the CLASS_DEF entries into a dictionary
for impexp in tree.iter("ImpExp"):
name = impexp.get('Name')
if impexp.get('Type') == "CLASS_DEF":
for list_of_object_arrt in impexp.findall('ListOfObject_Arrt'):
class_defs[name] = [(obj.get('Orig_Id'), obj.get('Attr_Name')) for obj in list_of_object_arrt]
with open(outFile, 'wb') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['ProductId', 'Product', 'AttributeId', 'Attribute'])
for impexp in tree.iter("ImpExp"):
object_num = impexp.get('Object_Num')
name = impexp.get('Name')
if impexp.get('Type') == "PROD_DEF":
for list_of_object_def in impexp.findall('ListOfObject_Def'):
for obj in list_of_object_def:
ancestor_num = obj.get('Ancestor_Num')
ancestor_name = obj.get('Ancestor_Name')
csv_output.writerow([object_num, name] + list(class_defs.get(ancestor_name, [['', '']])[0]))
This would produce new.csv containing:
ProductId,Product,AttributeId,Attribute
2008a,Laptop,6666p,LP_Portable
2987d,Mouse,7010p,O_Portable
5463g,Speaker,,
If you are using Python 3.x, use:
with open(outFile, 'w', newline='') as f_output:

XML file parsing - Get data from each parent and their own children

I would like to get data from each parent and their own children fro an XML file.
I'm trying to parse this XML file
<DB>
<Entry>
<Name>Assembly.iam</Name>
<DisplayName>Assembly.iam</DisplayName>
<Scalar>
<Name>d0</Name>
<DisplayName>d0 (value = 0 mm)</DisplayName>
<Value>0</Value>
</Scalar>
<Scalar>
<Name>d1</Name>
<DisplayName>d1 (value = 0 mm)</DisplayName>
<Value>0</Value>
</Scalar>
</Entry>
<Entry>
<Name>Ground.ipt</Name>
<DisplayName>Ground.ipt</DisplayName>
<Scalar>
<Name>Ground_length</Name>
<DisplayName>Ground_length (value = 160 mm)</DisplayName>
<Value>160</Value>
</Scalar>
<Scalar>
<Name>d2</Name>
<DisplayName>d2 (value = 80 mm)</DisplayName>
<Value>80</Value>
</Scalar>
</Entry>
</DB>
In fact, I would like to get the data which are into <DisplayName></DisplayName>.
Then, I would like to put that data into an array of tuples like this
[(Assembly.iam,[d0 (value = 0 mm), d1 (value = 0 mm)]),
(Ground.ipt,[Ground_length (value = 160 mm), d2 (value = 80 mm)])
I have tried to use the xml.etree.cElementTree library with this code
from xml.etree import cElementTree
import numpy as np
workingDir = "C:/Users/Vince/Test"
newStrWorkingDir = str.replace(workingDir, '/', '\\')
tree = cElementTree.parse(newStrWorkingDir + "\\test.xml")
root = tree.getroot()
tab = np.empty(shape=(0, 0))
tabEntry = np.empty(shape=(0, 0))
tabScalar = np.empty(shape=(0, 0))
for entry in root.findall('Entry'):
entryNames = entry.findall("./DisplayName")
entryNamesText = entry.find("./DisplayName").text
tabEntry = np.append(tabEntry,entryNamesText)
for scalar in entry.findall('Scalar'):
scalarNames = scalar.findall("./DisplayName")
scalarNamesText = scalar.find("./DisplayName").text
tabScalar = np.append(tabScalar,scalarNamesText)
tab = np.append(tab,(entryNamesText,scalarNamesText))
print(tab)
But it outputs me this
['Assembly.iam' 'd0 (value = 0 mm)'
'Assembly.iam' 'd1 (value = 0 mm)'
'Ground.ipt' 'Ground_length (value = 160 mm)'
'Ground.ipt' 'd2 (value = 80 mm)']

To get your wanted structure, you have to build lists of lists:
import os
from xml.etree import cElementTree
workingDir = "C:\\Users\\Vince\\Test"
tree = cElementTree.parse(os.path.join(newStrWorkingDir, "test.xml"))
root = tree.getroot()
tab = []
for entry in root.findall('Entry'):
entry_name = entry.findtext("./DisplayName")
scalar_names = [e.text for e in entry.findall('Scalar/DisplayName')]
tab.append((entry_name, scalar_names))
print(tab)

How to merge 2 xml files with namespaces

I am trying to merge two XML files using ElementTree module. Following are the XMLs:
a.xml:
<?xml version="1.0"?>
<ListOrdersResponse xmlns="https://mws.amazonservices.com/Orders/2013-09-01">
<ListOrdersResult>
<NextToken>token</NextToken>
<CreatedBefore>2014-10-07T08:13:11Z</CreatedBefore>
<Orders>
<Order>
<AmazonOrderId>12345</AmazonOrderId>
<SellerOrderId>R12345</SellerOrderId>
<PurchaseDate>2014-10-02T14:40:37Z</PurchaseDate>
<LastUpdateDate>2014-10-03T09:47:02Z</LastUpdateDate>
<OrderStatus>Shipped</OrderStatus>
<FulfillmentChannel>MFN</FulfillmentChannel>
<SalesChannel>Amazon.in</SalesChannel>
<ShipServiceLevel>IN Exp Dom 2</ShipServiceLevel>
<ShippingAddress>
<Name>name</Name>
<AddressLine1>line1</AddressLine1>
<AddressLine2>line2</AddressLine2>
<City>Pune</City>
<StateOrRegion>Maharashtra</StateOrRegion>
<PostalCode>411027</PostalCode>
<CountryCode>IN</CountryCode>
<Phone>123456789</Phone>
</ShippingAddress>
<OrderTotal>
<CurrencyCode>INR</CurrencyCode>
<Amount>520.00</Amount>
</OrderTotal>
<NumberOfItemsShipped>1</NumberOfItemsShipped>
<NumberOfItemsUnshipped>0</NumberOfItemsUnshipped>
<PaymentExecutionDetail/>
<PaymentMethod>Other</PaymentMethod>
<MarketplaceId>mid</MarketplaceId>
<BuyerEmail>email#buyer.com</BuyerEmail>
<BuyerName>name</BuyerName>
<ShipmentServiceLevelCategory>Expedited</ShipmentServiceLevelCategory>
<ShippedByAmazonTFM>false</ShippedByAmazonTFM>
<TFMShipmentStatus>Delivered</TFMShipmentStatus>
<OrderType>StandardOrder</OrderType>
<EarliestShipDate>2014-10-05T18:30:00Z</EarliestShipDate>
<LatestShipDate>2014-10-07T18:29:59Z</LatestShipDate>
<EarliestDeliveryDate>2014-10-07T18:30:00Z</EarliestDeliveryDate>
<LatestDeliveryDate>2014-10-11T18:29:59Z</LatestDeliveryDate>
</Order>
</Orders>
</ListOrdersResult>
</ListOrdersResponse>
b.xml:
<?xml version="1.0"?>
<ListOrdersByNextTokenResponse xmlns="https://mws.amazonservices.com/Orders/2013-09-01">
<ListOrdersByNextTokenResult>
<NextToken>token1</NextToken>
<CreatedBefore>2014-10-07T08:13:11Z</CreatedBefore>
<Orders>
<Order>
<AmazonOrderId>oid1</AmazonOrderId>
<PurchaseDate>2014-10-04T13:37:41Z</PurchaseDate>
<LastUpdateDate>2014-10-06T09:52:21Z</LastUpdateDate>
<OrderStatus>Shipped</OrderStatus>
<FulfillmentChannel>MFN</FulfillmentChannel>
<SalesChannel>Amazon.in</SalesChannel>
<ShipServiceLevel>IN Std Dom 2_50k_cod</ShipServiceLevel>
<ShippingAddress>
<Name>name1</Name>
<AddressLine1>line1-1</AddressLine1>
<AddressLine2>line2-1</AddressLine2>
<City>WADHVANCITY,SURENDRANAGAR</City>
<StateOrRegion>Gujarat</StateOrRegion>
<PostalCode>363035</PostalCode>
<CountryCode>IN</CountryCode>
<Phone>987654321</Phone>
</ShippingAddress>
<OrderTotal>
<CurrencyCode>INR</CurrencyCode>
<Amount>242.00</Amount>
</OrderTotal>
<NumberOfItemsShipped>1</NumberOfItemsShipped>
<NumberOfItemsUnshipped>0</NumberOfItemsUnshipped>
<PaymentExecutionDetail/>
<PaymentMethod>Other</PaymentMethod>
<MarketplaceId>mid1</MarketplaceId>
<BuyerEmail>email1#buyer.com</BuyerEmail>
<BuyerName>name1</BuyerName>
<ShipmentServiceLevelCategory>Standard</ShipmentServiceLevelCategory>
<ShippedByAmazonTFM>false</ShippedByAmazonTFM>
<TFMShipmentStatus>PendingPickUp</TFMShipmentStatus>
<OrderType>StandardOrder</OrderType>
<EarliestShipDate>2014-10-05T18:30:00Z</EarliestShipDate>
<LatestShipDate>2014-10-07T18:29:59Z</LatestShipDate>
<EarliestDeliveryDate>2014-10-09T18:30:00Z</EarliestDeliveryDate>
<LatestDeliveryDate>2014-10-15T18:29:59Z</LatestDeliveryDate>
</Order>
</Orders>
</ListOrdersByNextTokenResult>
</ListOrdersByNextTokenResponse>
I want to add the elements inside Orders elemnt in b.xml to that of a.xml
So, the expected output is:
<?xml version="1.0"?>
<ListOrdersResponse xmlns="https://mws.amazonservices.com/Orders/2013-09-01">
<ListOrdersResult>
<NextToken>token</NextToken>
<CreatedBefore>2014-10-07T08:13:11Z</CreatedBefore>
<Orders>
<Order>
<AmazonOrderId>12345</AmazonOrderId>
<SellerOrderId>R12345</SellerOrderId>
<PurchaseDate>2014-10-02T14:40:37Z</PurchaseDate>
<LastUpdateDate>2014-10-03T09:47:02Z</LastUpdateDate>
<OrderStatus>Shipped</OrderStatus>
<FulfillmentChannel>MFN</FulfillmentChannel>
<SalesChannel>Amazon.in</SalesChannel>
<ShipServiceLevel>IN Exp Dom 2</ShipServiceLevel>
<ShippingAddress>
<Name>name</Name>
<AddressLine1>line1</AddressLine1>
<AddressLine2>line2</AddressLine2>
<City>Pune</City>
<StateOrRegion>Maharashtra</StateOrRegion>
<PostalCode>411027</PostalCode>
<CountryCode>IN</CountryCode>
<Phone>123456789</Phone>
</ShippingAddress>
<OrderTotal>
<CurrencyCode>INR</CurrencyCode>
<Amount>520.00</Amount>
</OrderTotal>
<NumberOfItemsShipped>1</NumberOfItemsShipped>
<NumberOfItemsUnshipped>0</NumberOfItemsUnshipped>
<PaymentExecutionDetail/>
<PaymentMethod>Other</PaymentMethod>
<MarketplaceId>mid</MarketplaceId>
<BuyerEmail>email#buyer.com</BuyerEmail>
<BuyerName>name</BuyerName>
<ShipmentServiceLevelCategory>Expedited</ShipmentServiceLevelCategory>
<ShippedByAmazonTFM>false</ShippedByAmazonTFM>
<TFMShipmentStatus>Delivered</TFMShipmentStatus>
<OrderType>StandardOrder</OrderType>
<EarliestShipDate>2014-10-05T18:30:00Z</EarliestShipDate>
<LatestShipDate>2014-10-07T18:29:59Z</LatestShipDate>
<EarliestDeliveryDate>2014-10-07T18:30:00Z</EarliestDeliveryDate>
<LatestDeliveryDate>2014-10-11T18:29:59Z</LatestDeliveryDate>
</Order>
<Order>
<AmazonOrderId>oid1</AmazonOrderId>
<PurchaseDate>2014-10-04T13:37:41Z</PurchaseDate>
<LastUpdateDate>2014-10-06T09:52:21Z</LastUpdateDate>
<OrderStatus>Shipped</OrderStatus>
<FulfillmentChannel>MFN</FulfillmentChannel>
<SalesChannel>Amazon.in</SalesChannel>
<ShipServiceLevel>IN Std Dom 2_50k_cod</ShipServiceLevel>
<ShippingAddress>
<Name>name1</Name>
<AddressLine1>line1-1</AddressLine1>
<AddressLine2>line2-1</AddressLine2>
<City>WADHVANCITY,SURENDRANAGAR</City>
<StateOrRegion>Gujarat</StateOrRegion>
<PostalCode>363035</PostalCode>
<CountryCode>IN</CountryCode>
<Phone>987654321</Phone>
</ShippingAddress>
<OrderTotal>
<CurrencyCode>INR</CurrencyCode>
<Amount>242.00</Amount>
</OrderTotal>
<NumberOfItemsShipped>1</NumberOfItemsShipped>
<NumberOfItemsUnshipped>0</NumberOfItemsUnshipped>
<PaymentExecutionDetail/>
<PaymentMethod>Other</PaymentMethod>
<MarketplaceId>mid1</MarketplaceId>
<BuyerEmail>email1#buyer.com</BuyerEmail>
<BuyerName>name1</BuyerName>
<ShipmentServiceLevelCategory>Standard</ShipmentServiceLevelCategory>
<ShippedByAmazonTFM>false</ShippedByAmazonTFM>
<TFMShipmentStatus>PendingPickUp</TFMShipmentStatus>
<OrderType>StandardOrder</OrderType>
<EarliestShipDate>2014-10-05T18:30:00Z</EarliestShipDate>
<LatestShipDate>2014-10-07T18:29:59Z</LatestShipDate>
<EarliestDeliveryDate>2014-10-09T18:30:00Z</EarliestDeliveryDate>
<LatestDeliveryDate>2014-10-15T18:29:59Z</LatestDeliveryDate>
</Order>
</Orders>
</ListOrdersResult>
</ListOrdersResponse>
I tried:
import xml.etree.ElementTree as ET
import os
import shlex
import subprocess
tree = ET.parse("a.xml")
root = tree.getroot()
combined_xml = root
namespaces = {'resp': 'https://mws.amazonservices.com/Orders/2013-09-01'}
results = combined_xml.find("resp:ListOrdersResult", namespaces=namespaces)
insertion_point = results.find("resp:Orders", namespaces=namespaces)
tree1 = ET.parse("b.xml")
root1 = tree1.getroot()
results1 = root1.find("resp:ListOrdersByNextTokenResult", namespaces=namespaces)
order_array1 = results1.find("resp:Orders", namespaces=namespaces)
for order in order_array1:
insertion_point.extend(order)
print ET.tostring(combined_xml)
But I am getting the following output:
<ns0:ListOrdersResponse xmlns:ns0="https://mws.amazonservices.com/Orders/2013-09-01">
<ns0:ListOrdersResult>
<ns0:NextToken>token</ns0:NextToken>
<ns0:CreatedBefore>2014-10-07T08:13:11Z</ns0:CreatedBefore>
<ns0:Orders>
<ns0:Order>
<ns0:AmazonOrderId>12345</ns0:AmazonOrderId>
<ns0:SellerOrderId>R12345</ns0:SellerOrderId>
<ns0:PurchaseDate>2014-10-02T14:40:37Z</ns0:PurchaseDate>
<ns0:LastUpdateDate>2014-10-03T09:47:02Z</ns0:LastUpdateDate>
<ns0:OrderStatus>Shipped</ns0:OrderStatus>
<ns0:FulfillmentChannel>MFN</ns0:FulfillmentChannel>
<ns0:SalesChannel>Amazon.in</ns0:SalesChannel>
<ns0:ShipServiceLevel>IN Exp Dom 2</ns0:ShipServiceLevel>
<ns0:ShippingAddress>
<ns0:Name>name</ns0:Name>
<ns0:AddressLine1>line1</ns0:AddressLine1>
<ns0:AddressLine2>line2</ns0:AddressLine2>
<ns0:City>Pune</ns0:City>
<ns0:StateOrRegion>Maharashtra</ns0:StateOrRegion>
<ns0:PostalCode>411027</ns0:PostalCode>
<ns0:CountryCode>IN</ns0:CountryCode>
<ns0:Phone>123456789</ns0:Phone>
</ns0:ShippingAddress>
<ns0:OrderTotal>
<ns0:CurrencyCode>INR</ns0:CurrencyCode>
<ns0:Amount>520.00</ns0:Amount>
</ns0:OrderTotal>
<ns0:NumberOfItemsShipped>1</ns0:NumberOfItemsShipped>
<ns0:NumberOfItemsUnshipped>0</ns0:NumberOfItemsUnshipped>
<ns0:PaymentExecutionDetail />
<ns0:PaymentMethod>Other</ns0:PaymentMethod>
<ns0:MarketplaceId>mid</ns0:MarketplaceId>
<ns0:BuyerEmail>email#buyer.com</ns0:BuyerEmail>
<ns0:BuyerName>name</ns0:BuyerName>
<ns0:ShipmentServiceLevelCategory>Expedited</ns0:ShipmentServiceLevelCategory>
<ns0:ShippedByAmazonTFM>false</ns0:ShippedByAmazonTFM>
<ns0:TFMShipmentStatus>Delivered</ns0:TFMShipmentStatus>
<ns0:OrderType>StandardOrder</ns0:OrderType>
<ns0:EarliestShipDate>2014-10-05T18:30:00Z</ns0:EarliestShipDate>
<ns0:LatestShipDate>2014-10-07T18:29:59Z</ns0:LatestShipDate>
<ns0:EarliestDeliveryDate>2014-10-07T18:30:00Z</ns0:EarliestDeliveryDate>
<ns0:LatestDeliveryDate>2014-10-11T18:29:59Z</ns0:LatestDeliveryDate>
</ns0:Order>
<ns0:AmazonOrderId>oid1</ns0:AmazonOrderId>
<ns0:PurchaseDate>2014-10-04T13:37:41Z</ns0:PurchaseDate>
<ns0:LastUpdateDate>2014-10-06T09:52:21Z</ns0:LastUpdateDate>
<ns0:OrderStatus>Shipped</ns0:OrderStatus>
<ns0:FulfillmentChannel>MFN</ns0:FulfillmentChannel>
<ns0:SalesChannel>Amazon.in</ns0:SalesChannel>
<ns0:ShipServiceLevel>IN Std Dom 2_50k_cod</ns0:ShipServiceLevel>
<ns0:ShippingAddress>
<ns0:Name>name1</ns0:Name>
<ns0:AddressLine1>line1-1</ns0:AddressLine1>
<ns0:AddressLine2>line2-1</ns0:AddressLine2>
<ns0:City>WADHVANCITY,SURENDRANAGAR</ns0:City>
<ns0:StateOrRegion>Gujarat</ns0:StateOrRegion>
<ns0:PostalCode>363035</ns0:PostalCode>
<ns0:CountryCode>IN</ns0:CountryCode>
<ns0:Phone>987654321</ns0:Phone>
</ns0:ShippingAddress>
<ns0:OrderTotal>
<ns0:CurrencyCode>INR</ns0:CurrencyCode>
<ns0:Amount>242.00</ns0:Amount>
</ns0:OrderTotal>
<ns0:NumberOfItemsShipped>1</ns0:NumberOfItemsShipped>
<ns0:NumberOfItemsUnshipped>0</ns0:NumberOfItemsUnshipped>
<ns0:PaymentExecutionDetail />
<ns0:PaymentMethod>Other</ns0:PaymentMethod>
<ns0:MarketplaceId>mid1</ns0:MarketplaceId>
<ns0:BuyerEmail>email1#byer.com</ns0:BuyerEmail>
<ns0:BuyerName>name1</ns0:BuyerName>
<ns0:ShipmentServiceLevelCategory>Standard</ns0:ShipmentServiceLevelCategory>
<ns0:ShippedByAmazonTFM>false</ns0:ShippedByAmazonTFM>
<ns0:TFMShipmentStatus>PendingPickUp</ns0:TFMShipmentStatus>
<ns0:OrderType>StandardOrder</ns0:OrderType>
<ns0:EarliestShipDate>2014-10-05T18:30:00Z</ns0:EarliestShipDate>
<ns0:LatestShipDate>2014-10-07T18:29:59Z</ns0:LatestShipDate>
<ns0:EarliestDeliveryDate>2014-10-09T18:30:00Z</ns0:EarliestDeliveryDate>
<ns0:LatestDeliveryDate>2014-10-15T18:29:59Z</ns0:LatestDeliveryDate>
</ns0:Orders>
</ns0:ListOrdersResult>
</ns0:ListOrdersResponse>
Why am I getting ns0? Also, <Order> tag is missing for the second order. How can I get the desired output without ns0. I am ok with suggestions for using another module if it makes life easier.:)
Thanks

ns0 means 'namespace 0' - it's a result of your namespace dict and "resp:tagname" terms.
I'd really recommend using beautifulsoup4 for this, though - it's much nicer for working with xml:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('a.xml'))
insertion_point = soup.listordersresult.orders
orders_b = BeautifulSoup(open('b.xml')).listordersbynexttokenresult.orders
# could probably just be orders_b = BeautifulSoup(open('b.xml'))
orders_to_insert = orders_b.find_all('order')
for order in orders_to_insert:
insertion_point.append(order)
print(soup)

import xml.etree.ElementTree as ET
from StringIO import StringIO
namespaces = {'resp': 'https://mws.amazonservices.com/Orders/2013-09-01'}
tree = ET.parse("a.xml")
root = tree.getroot()
results = root.find("resp:ListOrdersResult", namespaces=namespaces)
order_array = results.find("resp:Orders", namespaces=namespaces).getchildren()
tree1 = ET.parse("b.xml")
root1 = tree1.getroot()
results1 = root1.find("resp:ListOrdersByNextTokenResult", namespaces=namespaces)
order_array1 = results1.find("resp:Orders", namespaces=namespaces).getchildren()
for order in order_array1:
order_array.append(order)
tree.write("temp.xml")
correct_data = open("temp.xml").read().replace('ns0:', '').replace(':ns0','')
filewrite = open("combined.xml", 'w')
filewrite.write(correct_data)
filewrite.close()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Parse xml with python getting x,y values - python

Related

Txt to XML Python parser

Merge output loop lxml

Create a dataframe from nested xml and generate a csv

XML file parsing - Get data from each parent and their own children

How to merge 2 xml files with namespaces

Categories

Resources