python CDATA in subelement - python

I'm trying to add CDATA to a SubElement of an XML document, but the serializer keeps escaping the < character as &lt;.
I've read some posts here that do something with CDATA, but I can't believe it's this hard to make it work. Besides that, I can't get those examples working.
Below a part of my code simplified, using python 3.4.
import xml.etree.cElementTree as ET
from xml.dom import minidom
class MyClass():
    """Builds a small AVXML document and writes a pretty-printed copy to disk."""

    def __init__(self):
        # Root element of the document; None until generateXML() runs.
        self.xml = None

    def generateXML(self):
        """Create AVXML/ROW/DATA with the CDATA markup assigned as plain text.

        The markup is stored as ordinary element text, so ElementTree
        escapes its angle brackets when the tree is serialized.
        """
        root = ET.Element("AVXML")
        self.xml = root
        row_node = ET.SubElement(root, "ROW")
        data_node = ET.SubElement(row_node, "DATA")
        data_node.text = "<![CDATA[ <ART_HDR.COMMENT1>.PDF ]]>"

    def saveXML(self):
        """Pretty-print the tree, echo it, and write it to filetosave.xml."""
        serialized = ET.tostring(self.xml, 'Windows-1252')
        document = minidom.parseString(serialized)
        pretty_bytes = document.toprettyxml(indent=" ", encoding="Windows-1252")
        print(pretty_bytes)
        out_file = open("filetosave.xml", "wb")
        out_file.write(pretty_bytes)
        out_file.close()
# Build the document in memory, then pretty-print it to filetosave.xml.
m = MyClass()
m.generateXML()
m.saveXML()
This however generates:
<DATA>&lt;![CDATA[ &lt;ART_HDR.COMMENT1&gt;.PDF ]]&gt;</DATA>
Instead of
<DATA><![CDATA[ <ART_HDR.COMMENT1>.PDF ]]></DATA>

OK, I used the approach from this comment and got it working.
import xml.etree.cElementTree as ET
from xml.dom import minidom
class MyClass():
    """Builds an AVXML document whose DATA element carries a CDATA section."""

    def __init__(self):
        # Root element of the document; None until generateXML() runs.
        self.xml = None

    def generateXML(self):
        """Create AVXML/ROW/DATA and inject a CDATA section through a comment.

        ElementTree has no CDATA node and escapes element text. This relies
        on comment text being written out unescaped: a comment whose text
        starts with ' -->' and ends with '<!-- ' closes itself early, emits
        the raw CDATA section, and then opens a second, empty comment.
        """
        self.xml = ET.Element("AVXML")
        row = ET.SubElement(self.xml, "ROW")
        data = " <ART_HDR.COMMENT1>.PDF "
        # A literal ']]>' inside the payload would terminate the section
        # early, so it is split across two adjacent CDATA sections.
        safe_data = data.replace(']]>', ']]]]><![CDATA[>')
        cdata = ET.SubElement(row, "DATA")
        cdata.append(ET.Comment(' --><![CDATA[' + safe_data + ']]><!-- '))

    def saveXML(self, path="filetosave.xml"):
        """Pretty-print the tree via minidom and write it as Windows-1252.

        Args:
            path: Destination file; defaults to "filetosave.xml".
        """
        rough_string = ET.tostring(self.xml, 'Windows-1252')
        reparsed = minidom.parseString(rough_string)
        prettyxml = reparsed.toprettyxml(indent=" ", encoding="Windows-1252")
        # The context manager closes the file even if the write fails.
        with open(path, "wb") as f:
            f.write(prettyxml)
# Build the document in memory, then write the pretty-printed XML to disk.
m = MyClass()
m.generateXML()
m.saveXML()

Related

How can I write to an HTML file from Python class

I am trying to write to an HTML file from my IDLE Python file. My task is to write all HTML code essentially as one string, using tags that I have made in my HTML object file. I am unsure of how to properly write my 'write()' method into my HTML file.
Here is my HTML object code:
from ReusableCustomerObject import customer
from Milestone2 import*
class HTMLwebpage:
    """Helper that produces HTML fragments as plain strings.

    Each tag helper converts its argument with str() and wraps it in the
    matching opening and closing tags.
    """

    def __init__(self, st):
        self.st = st

    def htmlopen(self, st):
        """Open HTMLMilestone3.html for writing and keep the handle on self.f.

        The st argument is accepted but not used. Returns None.
        """
        self.f = open('HTMLMilestone3.html', 'w')

    @staticmethod
    def _wrap(tag, st):
        """Return str(st) enclosed in <tag>...</tag>."""
        return "<" + tag + ">" + str(st) + "</" + tag + ">"

    def table(self, st):
        """Return the opening <table> tag; st is accepted but not used."""
        # The helpers previously assigned to __table etc. but returned the
        # unprefixed name, which is undefined and raised NameError.
        return "<table style='width: 100%;'; border='1'>"

    def tbody(self, st):
        """Return st wrapped in <tbody> tags."""
        return self._wrap("tbody", st)

    def tr(self, st):
        """Return st wrapped in <tr> tags."""
        return self._wrap("tr", st)

    def th(self, st):
        """Return st wrapped in <th> tags."""
        return self._wrap("th", st)

    def td(self, st):
        """Return st wrapped in <td> tags."""
        return self._wrap("td", st)

    def paragraph(self, st):
        """Return st wrapped in <p> tags."""
        return self._wrap("p", st)

    def h1(self, st):
        """Return st wrapped in <h1> tags."""
        return self._wrap("h1", st)

    def write(self, st):
        """Pass str(formatClient()) to self.st.write() and return its result.

        The st argument is overwritten immediately and therefore ignored.
        """
        # NOTE(review): formatClient is not defined in this class; presumably
        # it comes from `from Milestone2 import *` -- confirm.
        # NOTE(review): this requires self.st to expose write(); the handle
        # opened by htmlopen() is stored on self.f instead -- confirm which
        # one is intended.
        st = formatClient()
        return self.st.write(str(st))
# NOTE(review): neither `write` nor `st` is defined at this scope in the code
# shown, so this call cannot succeed as written.
# NOTE(review): this opens 'HTMLMilestone3' (no extension) while htmlopen()
# opens 'HTMLMilestone3.html' -- confirm the intended target file.
fh = open('HTMLMilestone3', 'w')
fh.write(write(st))
fh.close()
You need to build a string like the one below and write it to the HTML file you want to create or edit:
message = """<html>
<head></head>
<body><p>Hello World!</p></body>
</html>"""
after you have done this with your class functions, you write the string you created to the file:
# Mode 'w' overwrites any existing file; use 'a' to append instead.
# The context manager closes the file even if the write fails.
with open('helloworld.html', 'w') as f:
    f.write(message)
in the end it's gonna look something like that:
from HTMLwebpage import *
# The tag helpers are instance methods of HTMLwebpage, so call them on an
# instance. htmlopen() is left out of the expression: it only opens a file and
# returns None, so it cannot be concatenated into the markup.
page = HTMLwebpage("")
message = page.h1("Hello") + page.tr("the tr")
# Mode 'w' overwrites any existing file; use 'a' to append instead.
with open('helloworld.html', 'w') as f:
    f.write(message)

Values from a XML file

I'm trying to get the values inside a XML, but with this code I only get the label name value, I want the value itself from the label name.
XML Text:
<root><label_params><label_param><name>BranchName</name><value></value></label_param><label_param><name>CustomerCode</name><value></value></label_param><label_param><name>SealNumber</name><value>0110000000420</value></label_param><label_param><name>CustomerName</name><value>PUNTO EDUCATIVO LTDA</value></label_param><label_param><name>LpnTypeCode</name><value>LPN</value></label_param><label_param><name>OutboundNumber</name><value>1685147.1</value></label_param><label_param><name>ReferenceNumber</name><value>18072019_pall_cerr</value></label_param><label_param><name>DeliveryAddress1</name><value>Sin Direccion</value></label_param><label_param><name>NroCita</name><value></value></label_param><label_param><name>FechaEnt</name><value>19/07/2019</value></label_param><label_param><name>Porder</name><value>18072019_pall_cerr</value></label_param><label_param><name>Factura</name><value></value></label_param><label_param><name>IdLpnCode</name><value>C0000000015</value></label_param><label_param><name>TotalBultos</name><value></value></label_param><label_param><name>ANDENWMS</name><value>ANDEN15</value></label_param><label_param><name>LpnPadre</name><value>C0000000015</value></label_param><label_param><name>Cerrados</name><value>4</value></label_param><label_param><name>NoCerrados</name><value>2</value></label_param><label_param><name>TOTALPALLET</name><value></value></label_param></label_params></root>
Python Code
from xml.dom.minidom import parse
# Load the document and collect every <name> element in it.
doc = parse("DataXML.xml")
my_node_list = doc.getElementsByTagName("name")
# Only the first <name> element is inspected.
my_n_node = my_node_list[0]
my_child = my_n_node.firstChild
# .data is the text inside that <name> element, i.e. the label's name; the
# sibling <value> element is never read here.
my_text = my_child.data
print(my_text)
Here you go:
from xml.dom.minidom import parse
doc = parse("../data/DataXML.xml")
# Walk each <label_param> so every name stays paired with its own value.
my_node_list = doc.getElementsByTagName("label_param")
for node in my_node_list:
    name_node = node.getElementsByTagName("name")[0]
    value_node = node.getElementsByTagName("value")[0]
    print("Name: " + name_node.firstChild.data)
    # An empty <value></value> has no child text node, so firstChild is None.
    if value_node.firstChild is not None:
        print("Value: " + value_node.firstChild.data)
    else:
        print("Value: Empty")

XML to CSV but same tags under parent

I have an XML file like the one below and am trying to convert it to CSV with the xml2csv Python library, but the <images> tag, which contains several <img_item> children, breaks everything. I want each <img_item> value to end up in its own column. How can I achieve that?
Thanks,
<products>
<product>
<code>722</code>
<ws_code>B515C16CRU</ws_code>
<supplier_code>B515C16CRU</supplier_code>
<images>
<img_item type_name="">
https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3378-72-B.jpg
</img_item>
<img_item type_name="">
https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3379-72-B.jpg
</img_item>
<img_item type_name="">
https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3380-72-B.jpg
</img_item>
</images>
</product>
....
</products>
As you might have guessed, the problem is because each product node has multiple img_item tags which xml2csv does not know how to handle (and, going over its documentation, does not seem to have an option to let it know how to handle these nodes).
You can, however, do this quite easily using the builtin csv module. You just need to decide how you want to delimit the different images' urls. In the example below I've decided to use ; (obviously you can't use ,, unless you use another delimiter for the columns).
Also note that I hardcoded the headers. This can be (quite) easily changed so that the headers are dynamically detected from the product node's sub-elements.
import csv
import xml.etree.ElementTree as ET
string = '''<products>
<product>
<code>722</code>
<ws_code>B515C16CRU</ws_code>
<supplier_code>B515C16CRU</supplier_code>
<images>
<img_item type_name="">https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3378-72-B.jpg</img_item>
<img_item type_name="">https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3379-72-B.jpg</img_item>
<img_item type_name="">https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3380-72-B.jpg</img_item>
</images>
</product>
</products>'''
root = ET.fromstring(string)
headers = ('code', 'ws_code', 'supplier_code', 'images')
# Every header except the last maps to a single child element of <product>.
scalar_fields = headers[:-1]
# newline='' lets the csv module control line endings itself.
with open('test.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    for product in root.iter('product'):
        # findtext() yields None for a missing child instead of raising, and
        # strip() drops the newlines that surround values laid out on their
        # own lines in the source XML.
        row = {name: (product.findtext(name) or '').strip()
               for name in scalar_fields}
        # All image URLs of a product share one column, separated by ';'.
        row['images'] = ';'.join((img.text or '').strip()
                                 for img in product.iter('img_item'))
        writer.writerow(row)
Which produces the below CSV:
code,ws_code,supplier_code,images
722,B515C16CRU,B515C16CRU,https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3378-72-B.jpg;https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3379-72-B.jpg;https://www.apparel.com.tr/stance-corap-cruker-grey-orap-stance-ankle-bters-3380-72-B.jpg
import xml.etree.ElementTree as ET
import csv
import re
class xml_to_csv:
    """Flattens each child of an XML root into one CSV row of leaf texts."""

    def do(self):
        """Parse urunler-fotolu.xml, prompt for an output path, write the CSV."""
        # The input path is hard-coded; the prompt for it was disabled.
        self.tree = ET.parse("urunler-fotolu.xml")
        self.root = self.tree.getroot()
        # Backslashes are escaped so the prompt text is unchanged without
        # relying on invalid escape sequences such as "\p".
        self.csv_file_location = input("Enter full path to store CSV file(Eg = D:\\programs\\csv_file.csv ) : ")
        # newline='' lets the csv module control line endings; the context
        # manager guarantees the rows are flushed and the file is closed.
        self.csv_data = open(self.csv_file_location, 'w', newline='')
        with self.csv_data:
            self.csv_writer = csv.writer(self.csv_data)
            self.find_records(self.root)

    def find_attributes(self, record):
        """Return the text of every leaf element under record, in order.

        A record without children yields a one-item list holding its own
        text, which is None for an empty element.
        """
        children = list(record)
        if not children:
            return [record.text]
        texts = []
        for child in children:
            texts.extend(self.find_attributes(child))
        return texts

    @staticmethod
    def _clean(text):
        """Normalize one leaf text for CSV output."""
        # Empty elements have no text; indexing None used to raise TypeError.
        if text is None:
            return ''
        # Values laid out on their own line arrive wrapped in newlines and
        # indentation. Strip that whitespace rather than blindly dropping the
        # first and last character, which could cut off real data.
        if text.startswith('\n'):
            return text.strip()
        return text

    def find_records(self, root1):
        """Print and write one CSV row for each direct child of root1."""
        for record in root1:
            csv_record = [self._clean(text)
                          for text in self.find_attributes(record)]
            print(csv_record)
            self.csv_writer.writerow(csv_record)


if __name__ == "__main__":
    obj = xml_to_csv()
    obj.do()
Input:
For this = """
<State>
<Resident Id="100">
<Name>Sample Name</Name>
<PhoneNumber>1234567891</PhoneNumber>
<EmailAddress>sample_name#example.com</EmailAddress>
<Address>
<StreetLine1>Street Line1</StreetLine1>
<City>City Name</City>
<StateCode>AE</StateCode>
<PostalCode>12345</PostalCode>
</Address>
</Resident>
</State>
"""
Output :
['Sample Name', '1234567891', 'sample_name#example.com', 'Street Line1', 'City Name', 'AE', '12345']

How to use toprettyxml() to give xml tag and text in same line

I have this text file 20150731100543_1.txt
GI-eSTB-MIB-NPH::eSTBGeneralErrorCode.0 = INTEGER: 0
GI-eSTB-MIB-NPH::eSTBGeneralConnectedState.0 = INTEGER: true(1)
GI-eSTB-MIB-NPH::eSTBGeneralPlatformID.0 = INTEGER: 2075
GI-eSTB-MIB-NPH::eSTBMoCAfrequency.0 = INTEGER: 0
GI-eSTB-MIB-NPH::eSTBMoCAMACAddress.0 = STRING: 0:0:0:0:0:0
GI-eSTB-MIB-NPH::eSTBMoCANumberOfNodes.0 = INTEGER: 0
Which I want to convert in xml like below (20150731100543_1.xml)
<?xml version="1.0" encoding="UTF-8"?>
<doc>
<GI-eSTB-MIB-NPH>
<eSTBGeneralErrorCode.0>
INTEGER: 0
</eSTBGeneralErrorCode.0>
</GI-eSTB-MIB-NPH>
<GI-eSTB-MIB-NPH>
<eSTBGeneralConnectedState.0>
INTEGER: true(1)
</eSTBGeneralConnectedState.0>
</GI-eSTB-MIB-NPH>
<GI-eSTB-MIB-NPH>
<eSTBGeneralPlatformID.0>
INTEGER: 2075
</eSTBGeneralPlatformID.0>
</GI-eSTB-MIB-NPH>
<GI-eSTB-MIB-NPH>
<eSTBMoCAfrequency.0>
INTEGER: 0
</eSTBMoCAfrequency.0>
</GI-eSTB-MIB-NPH>
<GI-eSTB-MIB-NPH>
<eSTBMoCAMACAddress.0>
STRING: 0:0:0:0:0:0
</eSTBMoCAMACAddress.0>
</GI-eSTB-MIB-NPH>
<GI-eSTB-MIB-NPH>
<eSTBMoCANumberOfNodes.0>
INTEGER: 0
</eSTBMoCANumberOfNodes.0>
</GI-eSTB-MIB-NPH>
</doc>
I am able get this done using following script:
import sys
import time
import commands
from xml.etree.ElementTree import Element, SubElement
from xml.etree import ElementTree
from xml.dom import minidom
def prettify(elem):
    """Serialize elem and re-render it through minidom, indented by one space.

    The result is encoded as UTF-8 and uses "\\n" as the line separator.
    """
    dom = minidom.parseString(ElementTree.tostring(elem, 'utf-8'))
    return dom.toprettyxml(indent=" ", newl="\n", encoding="UTF-8")
if len(sys.argv) != 2:
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("\nUsage: python script.py <IP>\n")
    sys.exit(0)
filename_xml = '20150731100543_1.xml'  # filename_xml = temp + ".xml"
print("xml filename is: %s\n" % filename_xml)
top = Element('doc')
with open('20150731100543_1.txt') as f:
    for line in f:
        if not line.strip():
            # Blank lines have no fields to index into.
            continue
        # Text before the first ':' is the MIB name. A new wrapper element is
        # created for every input line, which is why the tag repeats in the
        # generated XML.
        b = line.split(':')
        child = SubElement(top, b[0])
        # Whitespace-separated fields: name, '=', type label, value.
        c = line.split()
        d = c[0].split(':')
        # Named `prop` to avoid shadowing the built-in `property`.
        prop = SubElement(child, d[2])
        prop.text = c[2] + " " + c[3]
# prettify() returns encoded output, so the file is opened in binary mode; the
# context manager closes it even if the write fails.
with open(filename_xml, 'wb') as xml:
    xml.write(prettify(top))
I have three questions here:
Is there any way (using toprettyxml() or something else) to change the
generated XML so that the opening tag, the text, and the closing tag of
an element are all on the same line?
Also, can I have the <GI-eSTB-MIB-NPH> tag only once, at the start and
at the end, instead of repeating it around every element below it
(since all the elements belong within this same tag)?
So if possible the format of xml should be like:
<?xml version="1.0" encoding="UTF-8"?>
<doc>
<GI-eSTB-MIB-NPH>
<eSTBGeneralErrorCode.0>INTEGER: 0</eSTBGeneralErrorCode.0>
<eSTBGeneralConnectedState.0>INTEGER: true(1)</eSTBGeneralConnectedState.0>
<eSTBGeneralPlatformID.0>INTEGER: 2075</eSTBGeneralPlatformID.0>
<eSTBMoCAfrequency.0>INTEGER: 0</eSTBMoCAfrequency.0>
<eSTBMoCAMACAddress.0>STRING: 0:0:0:0:0:0</eSTBMoCAMACAddress.0>
<eSTBMoCANumberOfNodes.0>INTEGER: 0</eSTBMoCANumberOfNodes.0>
</GI-eSTB-MIB-NPH>
</doc>
I am trying for this as this will reduce the number of lines in xml to great extent.
The last and least important question is:
Is there any better way to get the substrings of each line than
how I have done it using split()
with open('20150731100543_1.txt') as f:
    for line in f:
        # Text before the first ':' becomes the wrapper element's tag.
        b = line.split(':')
        child = SubElement(top, b[0])
        # Whitespace-separated fields of the line.
        c = line.split()
        # The first field split on ':'; index 2 follows the '::' separator.
        d = c[0].split(':')
        property = SubElement(child, d[2])
        # Third and fourth fields, re-joined with a single space.
        property.text = c[2] + " " + c[3]
Please forgive me for such lengthy post.
1 & 2: I use etree.tostring and I don't have any of these problems.
3: Multiple split operations can be replaced with regex.
This should work fine:
from lxml import etree
import re
filename_xml = '20150731100543_1.xml'
root = etree.Element('doc')
# One wrapper element shared by every row, instead of one per row.
node = etree.SubElement(root, 'GI-eSTB-MIB-NPH')
with open('20150731100543_1.txt') as f:
    text = f.read()
# get tag and value from each row
for tag, value in re.findall(r'GI-eSTB-MIB-NPH::(.*) = (.*$)', text, re.MULTILINE):
    # create child node
    etree.SubElement(node, tag).text = value
xml = etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)
# tostring() with an explicit encoding returns bytes, so write in binary
# mode. The context manager closes the file; the bare `f.close` it replaces
# only referenced the method without calling it.
with open(filename_xml, 'wb') as f:
    f.write(xml)

parsing an xml file for unknown elements using python ElementTree

I wish to extract all the tag names and their corresponding data from a multi-purpose xml file. Then save that information into a python dictionary (e.g tag = key, data = value). The catch being the tags names and values are unknown and of unknown quantity.
<some_root_name>
<tag_x>bubbles</tag_x>
<tag_y>car</tag_y>
<tag...>42</tag...>
</some_root_name>
I'm using ElementTree and can successfully extract the root tag and can extract values by referencing the tag names, but haven't been able to find a way to simply iterate over the tags and data without referencing a tag name.
Any help would be great.
Thank you.
from lxml import etree as ET
xmlString = """
<some_root_name>
<tag_x>bubbles</tag_x>
<tag_y>car</tag_y>
<tag...>42</tag...>
</some_root_name> """
document = ET.fromstring(xmlString)
# iter() walks the root and all of its descendants; it replaces the
# deprecated getiterator().
for elementtag in document.iter():
    print("elementtag name:", elementtag.tag)
EDIT:
To read from file instead of from string
document = ET.parse("myxmlfile.xml")
>>> import xml.etree.cElementTree as et
>>> xml = """
... <some_root_name>
... <tag_x>bubbles</tag_x>
... <tag_y>car</tag_y>
... <tag...>42</tag...>
... </some_root_name>
... """
>>> doc = et.fromstring(xml)
>>> print dict((el.tag, el.text) for el in doc)
{'tag_x': 'bubbles', 'tag_y': 'car', 'tag...': '42'}
If you really want 42 instead of '42', you'll need to work a little harder and less elegantly.
You could use xml.sax.handler to parse the XML:
import xml.sax as sax
import xml.sax.handler as saxhandler
import pprint
class TagParser(saxhandler.ContentHandler):
    """SAX handler that records the text of each element, keyed by tag name.

    After parsing, `tags` maps the name of every element that was the most
    recently opened one when it closed to the text it contained. Elements
    that share a name overwrite each other.
    """
    # http://docs.python.org/library/xml.sax.handler.html#contenthandler-objects

    def __init__(self):
        saxhandler.ContentHandler.__init__(self)
        self.tags = {}
        # Name of the most recently opened element, or None right after a close.
        self.tag = None
        self.data = None
        # Text pieces received for the current element.
        self._chunks = []

    def startElement(self, name, attrs):
        self.tag = name
        # Discard text that belonged to whatever came before this element.
        self._chunks = []

    def endElement(self, name):
        if self.tag:
            self.data = ''.join(self._chunks)
            self.tags[self.tag] = self.data
        self.tag = None
        self.data = None
        self._chunks = []

    def characters(self, content):
        # The parser may deliver one text run in several calls (for example
        # around entity references), so collect the pieces instead of keeping
        # only the last one.
        if self.tag is not None:
            self._chunks.append(content)
# Run the handler over an in-memory sample and pretty-print what it collected.
parser = TagParser()
src = '''\
<some_root_name>
<tag_x>bubbles</tag_x>
<tag_y>car</tag_y>
<tag...>42</tag...>
</some_root_name>'''
sax.parseString(src, parser)
pprint.pprint(parser.tags)
yields
{u'tag...': u'42', u'tag_x': u'bubbles', u'tag_y': u'car'}
This could be done using lxml in python
from lxml import etree
myxml = """
<root>
value
</root> """
doc = etree.XML(myxml)
# Map each element's tag to its text. Elements that share a tag overwrite one
# another, so only the last one visited is kept.
d = {element.tag: element.text for element in doc.iter()}
print(d)

Categories

Resources