Copy a node from one xml file to another using lxml

Copy a node from one xml file to another using lxml - python

I'm trying to find the simplest way of copying one node to another XML file. Both files will contain the same node - just the contents of that node will be different.
In the past I've done some crazy copying of each element and subelement - but there has to be a better way..
#Master XML
parser = etree.XMLParser(strip_cdata=False)
tree = etree.parse('file1.xml', parser)
# Find the //input node - which has a lot of subelems
inputMaster= tree.xpath('//input')[0]
#Dest XML -
parser2 = etree.XMLParser(strip_cdata=False)
tree2 = etree.parse('file2.xml', parser2)
# this won't work but.. it would be nice
etree.SubElement(tree2,'input') = inputMaster

Here's one way - its not brilliant as it loses the position (i.e. it pops the node at the end) but hey..
def getMaster(somefile):
parser = etree.XMLParser(strip_cdata=False)
tree = etree.parse(somefile, parser)
doc = tree.getroot()
inputMaster = doc.find('input')
return inputMaster
inputXML = getMaster('master_file.xml')
parser = etree.XMLParser(strip_cdata=False)
tree = etree.parse('file_to_copy_node_to.xml', parser)
doc = tree.getroot()
doc.remove(doc.find('input'))
doc.append(inputXML)
# Now write it
newxml = etree.tostring(tree, pretty_print=True)
f = open('file_to_copy_node_to.xml', 'w')
f.write(newxml)
f.close()

Related

Python - multiple logs/structures in the same xml file using ElementTree

I wrote an xml file with ElementTree. The problem i'm facing is that I want to write multiple logs in the same xml file.
The code:
import xml.etree.cElementTree as ET
#XML
root = ET.Element('LOG')
DATE = ET.SubElement(root, 'DATE')
DATE.text = "child_1"
TIME = ET.SubElement(root, 'TIME')
TIME.text = "child_2"
CC = ET.SubElement(root, 'CC')
CC.text = "child_3"
AMOUNT = ET.SubElement(root, 'AMOUNT')
AMOUNT.text = "child_4"
tree = ET.ElementTree(root)
#Generating XML
tree.write("file_name.xlm")
#Print
print(open("file_name.xlm").read())
Current output example:
<LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG>
Output that I want:
<LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG><LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG><LOG><DATE>child_1</DATE><TIME>child_2</TIME><CC>child_3</CC><AMOUNT>child_4</AMOUNT></LOG>...

What you want is just not how xml works. xml documents have one root node, and just one. The closest you can get is by wrapping your <LOG> entries in a container:
<ALL_LOGS>
<LOG>....</LOG>
<LOG>....</LOG>
</ALL_LOGS>

lxml give ROOT name attribute and xml file a version

EDIT****Other aim:
And I would love to iterate through every excel row and save each row as separate .xml file (filename = invoice.text)
any help appreciated
->>> the problem is, that the 2nd created .xml file has also data from the first row inside. Can anybody help me? Highly appreciated
help is appreciated, I want to give ROOT name attributes and the xml a version "" and save each excel row as a separate .xml file
I already setup the excel with openpyxl.
EDIT
Code edited
from lxml import etree
import openpyxl
# Create root element with namespace information
xmlns = "http://xml.datev.de/bedi/tps/ledger/v040"
xsi = "http://www.w3.org/2001/XMLSchema-instance"
schemaLocation = "http://xml.datev.de/bedi/tps/ledger/v040 Belegverwaltung_online_ledger_import_v040.xsd"
version = "4.0"
generator_info = "DATEV Musterdaten"
generating_system = "DATEV manuell"
xmlRoot = etree.Element(
"{" + xmlns + "}LedgerImport",
version=version,
attrib={"{" + xsi + "}schemaLocation": schemaLocation},
generator_info=generator_info,
generating_system=generating_system,
nsmap={'xsi': xsi, None: xmlns}
)
####open excel file speadsheet
wb = openpyxl.load_workbook('import_spendesk_datev.xlsx')
sheet = wb['Import']
# build the xml tree
for i in range(2,6):
#consolidate = etree.SubElement(xmlRoot, 'consolidate', attrib={'consolidatedAmount': str(sheet.cell(row=i,column=16).value, 'consolidatedDate': str(sheet.cell(row=i,column=2).value, 'consolidatedInvoiceId': str(sheet.cell(row=i,column=13).value, 'consolidatedCurrencyCode': str(sheet.cell(row=i,column=12).value )})
consolidate = etree.SubElement(xmlRoot, 'consolidate', attrib={'consolidatedAmount': str(sheet.cell(row=i,column=16).value),'consolidatedDate': str(sheet.cell(row=i,column=2).value), 'consolidatedInvoiceId': str(sheet.cell(row=i,column=13).value), 'consolidatedCurrencyCode': str(sheet.cell(row=i,column=12).value) })
accountsPayableLedger = etree.SubElement(consolidate, 'accountsPayableLedger')
account = etree.SubElement(accountsPayableLedger, 'bookingText')
account.text = sheet.cell(row=i,column=21).value
invoice = etree.SubElement(accountsPayableLedger, 'invoiceId')
invoice.text = sheet.cell(row=i,column=13).value
date = etree.SubElement(accountsPayableLedger, 'date')
date.text = sheet.cell(row=i,column=2).value
amount = etree.SubElement(accountsPayableLedger, 'amount')
amount.text = sheet.cell(row=i,column=16).value
account_no = etree.SubElement(accountsPayableLedger, 'accountNo')
account_no.text = sheet.cell(row=i,column=19).value
cost1 = etree.SubElement(accountsPayableLedger, 'costCategoryId')
cost1.text = sheet.cell(row=i,column=15).value
currency_code = etree.SubElement(accountsPayableLedger, 'currencyCode')
currency_code.text = sheet.cell(row=i,column=12).value
party_id = etree.SubElement(accountsPayableLedger, 'partyId')
party_id.text = sheet.cell(row=i,column=20).value
bpaccount = etree.SubElement(accountsPayableLedger, 'bpAccountNo')
bpaccount.text = sheet.cell(row=i,column=20).value
#doc = etree.ElementTree(xmlRoot)
#doc.write( str(sheet.cell(row=i,column=13).value)+".xml", xml_declaration=True, encoding='utf-8', pretty_print=True)
doc = etree.ElementTree(xmlRoot)
with open(str(sheet.cell(row=i,column=13).value)+".xml", 'w') as f:
f.write(etree.tostring(doc, pretty_print=True, xml_declaration=True, encoding='utf-8').decode('utf-8'))
# doc = etree.ElementTree(xmlRoot)
# with open("test1337.xml", 'w') as f:
# f.write(etree.tostring(doc, pretty_print=True, xml_declaration=True, encoding='utf-8').decode('utf-8'))
# convert into elementtree and write it directly into a file
#doc = etree.ElementTree(xmlRoot)
#outFile = open("test1337.xml", 'w')
#doc.write("test1337.xml", xml_declaration=True, encoding='utf-8', pretty_print=True)
#doc.close()
Please help sitting like hours on that.
Thanks so much

I recommend using etree elements and subelements and convert them later on into a element tree. This provides more flexibility while creating the xml, especially when you want to iterate over an existing data structure:
from lxml import etree
# Create root element with namespace information
xmlns = "http://xml.datev.de/bedi/tps/ledger/v040"
xsi = "http://www.w3.org/2001/XMLSchema-instance"
schemaLocation = "http://xml.datev.de/bedi/tps/ledger/v040 Belegverwaltung_online_ledger_import_v040.xsd"
version = "4.0"
generator_info = "DATEV Musterdaten"
generating_system = "DATEV manuell"
xmlRoot = etree.Element(
"{" + xmlns + "}LedgerImport",
version=version,
attrib={"{" + xsi + "}schemaLocation": schemaLocation},
generator_info=generator_info,
generating_system=generating_system,
nsmap={'xsi': xsi, None: xmlns}
)
# build the xml tree
consolidate = etree.SubElement(xmlRoot, 'consolidate', attrib={'consolidatedAmount': "1337.01"})
accountsPayableLedger = etree.SubElement(consolidate, 'accountsPayableLedger')
account = etree.SubElement(accountsPayableLedger, 'bookingText')
account.text = 'amazon'
invoice = etree.SubElement(accountsPayableLedger, 'invoiceId')
invoice.text = "1"
# convert into elementtree and write it directly into a file
doc = etree.ElementTree(xmlRoot)
with open("test1337.xml", 'w') as f:
f.write(etree.tostring(doc, pretty_print=True, xml_declaration=True, encoding='utf-8').decode('utf-8'))
The generated file looks like this:
<?xml version='1.0' encoding='UTF-8'?>
<LedgerImport xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://xml.datev.de/bedi/tps/ledger/v040" generating_system="DATEV manuell" generator_info="DATEV Musterdaten" version="4.0" xsi:schemaLocation="http://xml.datev.de/bedi/tps/ledger/v040 Belegverwaltung_online_ledger_import_v040.xsd">
<consolidate consolidatedAmount="1337.01">
<accountsPayableLedger>
<bookingText>amazon</bookingText>
<invoiceId>1</invoiceId>
</accountsPayableLedger>
</consolidate>
</LedgerImport>

case insensitive xml and python

I got this piece of code and I am trying to read all the 'ref' 'href' tags. I am not sure how to make this to be case insensitive as some of my xml files have REF or Ref or ref.
Any suggestions?
f = urllib.urlopen(url)
tree = ET.parse(f)
root = tree.getroot()
for child in root.iter('ref'):
t = child.get('href')
if t not in self.href:
self.href.append(t)
print self.href[-1]

You can normalize tags and attributes by converting them to lowercase using the functions below as a step of preprocessing:
import xml.etree.ElementTree as ET
f = urllib.urlopen(url)
tree = ET.parse(f)
root = tree.getroot()
def normalize_tags(root):
root.tag = root.tag.lower()
for child in root:
normalize_tags(child)
def normalize_attr(root):
for attr,value in root.attrib.items():
norm_attr = attr.lower()
if norm_attr != attr:
root.set(norm_attr,value)
root.attrib.pop(attr)
for child in root:
normalize_attr(child)
normalize_tags(root)
normalize_attr(root)
print(ET.tostring(root))

The following should help
f = urllib.urlopen(url)
tree = ET.parse(f)
root = tree.getroot()
for child in root:
if child.tag.lower() == 'ref':
t = child.attribute.get('href')
if t not in self.href:
self.href.append(t)
print self.href[-1]

If you are using lxml then one option is to use XPath with regular expressions through XSLT extensions (https://stackoverflow.com/a/2756994/2997179):
root.xpath("./*[re:test(local-name(), '(?i)href')]",
namespaces={"re": "http://exslt.org/regular-expressions"})

Changing a specific xml element using Python 3 ElementTree

I have a set of metadata files in xml which are updated regularly and I'm trying to automate.
I've worked out how to itteratively find and then replace text in the desired element of the xml but thought there must be a direct way to access and change the element. I just can't work it out.
The metadata xml is formatted:
<?xml version="1.0" ?>
<metadata xml:lang="en">
<Esri>
<CreaDate>20120405</CreaDate>
<CreaTime>13113000</CreaTime>
<ArcGISFormat>1.0</ArcGISFormat>
<SyncOnce>TRUE</SyncOnce>
<ModDate>20121129</ModDate>
<ModTime>11433300</ModTime>
<ArcGISProfile>ItemDescription</ArcGISProfile>
</Esri>
<dataIdInfo>
<idPurp>Updated :: 121129_114038</idPurp>
</dataIdInfo>
</metadata>
My iterative approach was:
for child in root:
for xel in child.iter('idPurp'):
download_new_datetime = strftime('%y%m%d_%H%M%S')
download_new_text = 'Downloaded :: '
xel.text = download_new_text + download_new_datetime
tree.write(xmlfile)
Ideas appreciated on a better way.

I would write to the file only once I'm done with the loop:
import xml.etree.ElementTree as ET
from time import strftime
xmlfile = '/tmp/file'
tree = ET.parse(xmlfile)
root = tree.getroot()
for child in root:
for xel in child.iter('idPurp'):
download_new_datetime = strftime('%y%m%d_%H%M%S')
download_new_text = 'Downloaded :: '
xel.text = download_new_text + download_new_datetime
tree.write(xmlfile)
I would even simplify that loop further to:
for child in root:
for xel in child.iter('idPurp'):
xel.text = 'Downloaded :: ' + time.strftime('%y%m%d_%H%M%S')

Two simpler ways, both work, tested.
First:
import xml.etree.ElementTree as ET
from time import strftime
xmlfile = 'metadata.xml'
tree = ET.parse(xmlfile)
root = tree.getroot()
xel = root.find('./dataIdInfo/idPurp')
xel.text = 'Downloaded :: ' + strftime('%y%m%d_%H%M%S')
tree.write(xmlfile)
Second:
import xml.etree.ElementTree as ET
from time import strftime
xmlfile = 'metadata.xml'
tree = ET.parse(xmlfile)
root = tree.getroot()
xel = root[1][0]
xel.text = 'Downloaded :: ' + strftime('%y%m%d_%H%M%S')
tree.write(xmlfile)
I prefer the first one, it's more readable in my opinion.

Code being dropped from xml created using python

I am copying and then updating a metadata xml file using python -this works fine except that the following code from the original metafile is being deleted
<?xml version="1.0" encoding="utf-8"?><?xml-stylesheet type='text/xsl' href='ANZMeta.xsl'?>
It needs to go at the start of the file.
The answer for this in PHP is # xml insertion at specific point of xml file but I need a solution for Python.
The code and full explanation is in my original post but I am seperating this question as it is different from the original issues I had. Search and replace multiple lines in xml/text files using python
Thanks,
FULL CODE
import os, xml, arcpy, shutil, datetime, Tkinter, tkFileDialog, tkSimpleDialog
from xml.etree import ElementTree as et
path=os.getcwd()
RootDirectory=path
currentPath=path
arcpy.env.workspace = path
Count=0
DECLARATION = """<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet type='text/xsl' href='ANZMeta.xsl'?>\n"""
Generated_XMLs=RootDirectory+'\GeneratedXML_LOG.txt'
f = open(Generated_XMLs, 'a')
f.write("Log of Metadata Creation Process - Update: "+str(datetime.datetime.now())+"\n")
f.close()
for root, dirs, files in os.walk(RootDirectory, topdown=False):
#print root, dirs
for directory in dirs:
try:
currentPath=os.path.join(root,directory)
except:
pass
os.chdir(currentPath)
arcpy.env.workspace = currentPath
print currentPath
#def Create_xml(currentPath):
FileList = arcpy.ListFeatureClasses()
zone="_Zone"
for File in FileList:
Count+=1
FileDesc_obj = arcpy.Describe(File)
FileNm=FileDesc_obj.file
check_meta=os.listdir(currentPath)
existingXML=FileNm[:FileNm.find('.')]
existingExtension=FileNm[FileNm.find('.'):]
print "XML: "+existingXML
#print check_meta
#if existingXML+'.xml' in check_meta:
#newMetaFile='new'
for f in check_meta:
if f.startswith(existingXML) and f.endswith('.xml'):
print "exists, file name:", f
newMetaFile=FileNm+"_2012Metadata.xml"
try:
shutil.copy2(f, newMetaFile)
except:
pass
break
else:
#print "Does not exist"
newMetaFile=FileNm+"_BaseMetadata.xml"
print "New meta file: "+newMetaFile+ " for: "+File
if newMetaFile.endswith('_BaseMetadata.xml'):
print "calling tkinter"
root = Tkinter.Tk()
root.withdraw()
file = tkFileDialog.askopenfile(parent=root,mode='rb',title='Choose a xml base file to match with: '+File)
if file != None:
metafile=os.path.abspath(file.name)
file.close()
#print metafile
shutil.copy2(metafile,newMetaFile)
print "copied"+metafile
root.destroy
else:
shutil.copy2('L:\Data_Admin\QA\Metadata_python_toolset\Master_Metadata.xml', newMetaFile)
#root = Tkinter.Tk()
#root.withdraw()
#newTitle=tkSimpleDialog.askstring('title', 'prompt')
#root.destroy
#print newTitle
print "Parsing meta file: "+newMetaFile
tree=et.parse(newMetaFile)
print "Processing: "+str(File)
for node in tree.findall('.//title'):
node.text = str(FileNm)
for node in tree.findall('.//procstep/srcused'):
node.text = str(currentPath+"\\"+existingXML+".xml")
dt=dt=str(datetime.datetime.now())
for node in tree.findall('.//procstep/date'):
node.text = str(dt[:10])
for node in tree.findall('.//procstep/time'):
node.text = str(dt[11:13]+dt[16:19])
for node in tree.findall('.//metd/date'):
node.text = str(dt[:10])
for node in tree.findall('.//northbc'):
node.text = str(FileDesc_obj.extent.YMax)
for node in tree.findall('.//southbc'):
node.text = str(FileDesc_obj.extent.YMin)
for node in tree.findall('.//westbc'):
node.text = str(FileDesc_obj.extent.XMin)
for node in tree.findall('.//eastbc'):
node.text = str(FileDesc_obj.extent.XMax)
for node in tree.findall('.//native/nondig/formname'):
node.text = str(os.getcwd()+"\\"+File)
for node in tree.findall('.//native/digform/formname'):
node.text = str(FileDesc_obj.featureType)
for node in tree.findall('.//avlform/nondig/formname'):
node.text = str(FileDesc_obj.extension)
for node in tree.findall('.//avlform/digform/formname'):
node.text = str(float(os.path.getsize(File))/int(1024))+" KB"
for node in tree.findall('.//theme'):
node.text = str(FileDesc_obj.spatialReference.name +" ; EPSG: "+str(FileDesc_obj.spatialReference.factoryCode))
print node.text
projection_info=[]
Zone=FileDesc_obj.spatialReference.name
if "GCS" in str(FileDesc_obj.spatialReference.name):
projection_info=[FileDesc_obj.spatialReference.GCSName, FileDesc_obj.spatialReference.angularUnitName, FileDesc_obj.spatialReference.datumName, FileDesc_obj.spatialReference.spheroidName]
print "Geographic Coordinate system"
else:
projection_info=[FileDesc_obj.spatialReference.datumName, FileDesc_obj.spatialReference.spheroidName, FileDesc_obj.spatialReference.angularUnitName, Zone[Zone.rfind(zone)-3:]]
print "Projected Coordinate system"
x=0
for node in tree.findall('.//spdom'):
for node2 in node.findall('.//keyword'):
#print node2.text
node2.text = str(projection_info[x])
#print node2.text
x=x+1
tree.write(newMetaFile)
with open(newMetaFile, 'w') as output: # would be better to write to temp file and rename
output.write(DECLARATION)
tree.write(output, xml_declaration=False, encoding='utf-8')
# xml_declaration=False - don't write default declaration
f = open(Generated_XMLs, 'a')
f.write(str(Count)+": "+File+"; "+newMetaFile+"; "+currentPath+";"+existingXML+"\n")
f.close()
# Create_xml(currentPath)
Error message from Wing IDE
xml.parsers.expat.ExpatError: no element found: line 3, column 0 File
"L:\Data_Admin\QA\Metadata_python_toolset\test2\update_Metadata1f.py",
line 78, in tree=et.parse(newMetaFile) File
"C:\Python26\ArcGIS10.0\Lib\xml\etree\ElementTree.py", line 862, in
parse tree.parse(source, parser) File
"C:\Python26\ArcGIS10.0\Lib\xml\etree\ElementTree.py", line 587, in
parse self._root = parser.close() File
"C:\Python26\ArcGIS10.0\Lib\xml\etree\ElementTree.py", line 1254, in
close self._parser.Parse("", 1) # end of data

I struggled with adding PI's to the start of an ElementTree document too. I came up with a solution using a fake root node (with None as the element tag) to hold any required processing instructions and then the real document root node.
import xml.etree.ElementTree as ET
# Build your XML document as normal...
root = ET.Element('root')
# Create 'fake' root node
fake_root = ET.Element(None)
# Add desired processing instructions. Repeat as necessary.
pi = ET.PI("xml-stylesheet", "type='text/xsl' href='ANZMeta.xsl'")
pi.tail = "\n"
fake_root.append(pi)
# Add real root as last child of fake root
fake_root.append(root)
# Write to file, using ElementTree.write( ) to generate <?xml ...?> tag.
tree = ET.ElementTree(fake_root)
tree.write("doc.xml", xml_declaration=True)
The resulting doc.xml file:
<?xml version='1.0' encoding='us-ascii'?>
<?xml-stylesheet type='text/xsl' href='ANZMeta.xsl'?>
<root />

If all your xml files have the same declaration, you can write it by yourself:
import xml.etree.ElementTree as ET
DECLARATION = """<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet type='text/xsl' href='ANZMeta.xsl'?>\n"""
tree = ET.parse(filename)
# do some work on tree
with open(filename, 'w') as output: # would be better to write to temp file and rename
output.write(DECLARATION)
tree.write(output, xml_declaration=False, encoding='utf-8')
# xml_declaration=False - don't write default declaration

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Copy a node from one xml file to another using lxml - python

Related

Python - multiple logs/structures in the same xml file using ElementTree

lxml give ROOT name attribute and xml file a version

case insensitive xml and python

Changing a specific xml element using Python 3 ElementTree

Code being dropped from xml created using python

Categories

Resources