The code below goes through the xml files and parses them into a single csv file
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
from pathlib import Path
# Folder that is searched recursively for *.xml input files.
directory = 'path to a folder with xml files'

# Flatten every <START> element of every XML file into rows of one CSV file.
# One row is written per <SetData> element; a <START> without any <SetData>
# therefore produces no row.
with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Column order of the CSV; each row is projected onto this list.
    headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
    writer.writerow(headers)

    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    print(xml_files_list)
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        start_nodes = root.findall('.//START')
        for sn in start_nodes:
            # Values shared by every row of this <START>: its own attributes
            # (id, service_code, ...) plus the text of selected descendants.
            repeated_values = dict(sn.attrib)
            for rn in sn.findall('.//Rational'):
                repeated_values['rational'] = rn.text
            for qu in sn.findall('.//Qualify'):
                repeated_values['qualify'] = qu.text
            for ds in sn.findall('.//Description'):
                repeated_values['description_txt'] = ds.text
                # .get() instead of ['num']: a <Description> without the
                # attribute yields an empty cell rather than a KeyError.
                repeated_values['description_num'] = ds.attrib.get('num', '')

            for st in sn.findall('.//SetData'):
                # defaultdict(str) makes columns absent on this element
                # come out as empty strings.
                row = defaultdict(str)
                for k, v in st.attrib.items():
                    row['set_data_' + str(k)] = v
                # Shared values are applied last, as before, so they win on
                # a key clash.
                row.update(repeated_values)
                row_data = [row[i] for i in headers]
                writer.writerow(row_data)
This is the xml file.
<?xml version="1.0" encoding="utf-8"?>
<ProjectData>
<Phones>
<Date />
<Prog />
<Box />
<Feature />
<IN>MAFWDS</IN>
<Set>234234</Set>
<Pr>23423</Pr>
<Number>afasfhrtv</Number>
<Simple>dfasd</Simple>
<Nr />
<Get>6070106091</Get>
<Reno>1233</Reno>
</Phones>
<FINAL>
<START id="B001" service_code="0x5196">
<Docs Docs_type="START">
<Rational>225196</Rational>
<Qualify>6251960000A0DE</Qualify>
</Docs>
<Description num="1213f2312">The parameter</Description>
<DataFile dg="12" dg_id="let">
<SetData value="32" />
</DataFile>
</START>
<START id="C003" service_code="0x517B">
<Docs Docs_type="START">
<Rational>23423</Rational>
<Qualify>342342</Qualify>
</Docs>
<Description num="3423423f3423">The third</Description>
<DataFile dg="55" dg_id="big">
<SetData x="E1" value="21259" />
<SetData x="E2" value="02" />
</DataFile>
</START>
<START id="Z048" service_code="0x5198">
<RawData rawdata_type="ASDS">
<Rational>225198</Rational>
<Qualify>343243324234234</Qualify>
</RawData>
<Description num="434234234">The forth</Description>
<DataFile unit="21" unit_id="FEDS">
<FileX unit="eg" discrete="false" axis_pts="19" name="Vsome" text_id="bx5" unit_id="GDFSD" />
<SetData xin="5" xax="233" value="323" />
<SetData xin="123" xax="77" value="555" />
<SetData xin="17" xax="65" value="23" />
</DataFile>
</START>
</FINAL>
</ProjectData>
This is what the output looks like:
I am currently struggling to modify the code so that it goes to Phones (which is another child of ProjectData), takes the text of the Set and Get elements, joins them with an underscore (_), and writes the result into the first column, under the header **Identify**.
The picture below shows how it should look.
Modify your headers line to
headers = ['identify', 'id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
p_get = tree.find('.//Phones/Get').text
p_set = tree.find('.//Phones/Set').text
and add this info to the row_data just before the line writer.writerow(row_data)
like this:
row_data.insert(0, p_get + '_' + p_set)
Update
row_data[0] = p_get + '_' + p_set
Related
I am working on a simple Python script to extract certain data from an XML file. The XML contains Windows events and their event IDs. The code is shown below. It fails when it needs to extract the data: the output file is created, but it is empty.
from xml.etree import ElementTree as ET
import csv
# Load the exported event log; everything below works off its root element.
tree = ET.parse("SecurityLog-rev2.xml")
root = tree.getroot()
# Tag of the first child with its last len("Event") characters cut off; it is
# used below as the prefix for child-element lookups.
# NOTE(review): this yields the namespace prefix only if that first child is
# an <Event> element - confirm against the real file.
url = root[0].tag[:-len("Event")]
fieldnames = ['EventID']

with open ('event_log.csv', 'w') as csvfile:
    writecsv = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writecsv.writeheader()

    for event in root:
        system = event.find(url + "System")
        output = {}
        fields = ['EventID']
        # for tag,att in fields:
        #     output[tag] = system.find(url + tag).attrib[att]

        # Collect every child of <EventData> as {Name attribute: text}.
        event_data = event.find(url + "EventData")
        if event_data is not None:
            for data in event_data:
                output[data.attrib['Name']] = data.text

        # NOTE(review): fieldnames lists only 'EventID', while output holds
        # the Data names; DictWriter rejects unknown keys by default
        # (extrasaction='raise').
        writecsv.writerow(output)
<Event xmlns='http://schemas.microsoft.com/win/2004/08/events/event'><System><Provider Name='Microsoft-Windows-Security-Auditing' Guid='{54849625-5478-4994-A5BA-3E3B0328C30D}'/>
<EventID>4634</EventID>
<Version>0</Version><Level>0</Level><Task>12545</Task><Opcode>0</Opcode><Keywords>0x8020000000000000</Keywords><TimeCreated SystemTime='2011-04-16T15:07:53.890625000Z'/>
<EventRecordID>1410962</EventRecordID><Correlation/><Execution ProcessID='452' ThreadID='3900'/><Channel>Security</Channel><Computer>DC01.AFC.com</Computer><Security/></System>
<EventData><Data Name='TargetUserSid'>S-1-5-21-2795111079-3225111112-3329435632-1610</Data>
<Data Name='TargetUserName'>grant.larson</Data>
<Data Name='TargetDomainName'>AFC</Data><Data Name='TargetLogonId'>0x3642df8</Data><Data Name='LogonType'>3</Data></EventData></Event>
I am not sure exactly what you want to parse. Here is a solution for the event ID and the event data:
Your XML File provided above as Input:
<?xml version="1.0" encoding="utf-8"?>
<Event xmlns='http://schemas.microsoft.com/win/2004/08/events/event'>
<System>
<Provider Name='Microsoft-Windows-Security-Auditing' Guid='{54849625-5478-4994-A5BA-3E3B0328C30D}' />
<EventID>4634</EventID>
<Version>0</Version>
<Level>0</Level>
<Task>12545</Task>
<Opcode>0</Opcode>
<Keywords>0x8020000000000000</Keywords>
<TimeCreated SystemTime='2011-04-16T15:07:53.890625000Z' />
<EventRecordID>1410962</EventRecordID>
<Correlation />
<Execution ProcessID='452' ThreadID='3900' />
<Channel>Security</Channel>
<Computer>DC01.AFC.com</Computer>
<Security />
</System>
<EventData>
<Data Name='TargetUserSid'>S-1-5-21-2795111079-3225111112-3329435632-1610</Data>
<Data Name='TargetUserName'>grant.larson</Data>
<Data Name='TargetDomainName'>AFC</Data>
<Data Name='TargetLogonId'>0x3642df8</Data>
<Data Name='LogonType'>3</Data>
</EventData>
</Event>
The program code without regex for catching the namespace:
from xml.etree import ElementTree as ET
import pandas as pd
import csv
tree = ET.parse("SecurityLog-rev2.xml")
root = tree.getroot()
# Namespace in the "{uri}" form that ElementTree prepends to every tag name.
ns = "{http://schemas.microsoft.com/win/2004/08/events/event}"

# (label, value) pairs in document order: the EventID found under each
# <System>, then one pair per <Data> found under each <EventData>.
data = []
# ".//*" names the descendant wildcard explicitly instead of relying on the
# trailing-slash form ".//".
for eventID in root.findall(".//*"):
    if eventID.tag == f"{ns}System":
        for e_id in eventID.iter(f"{ns}EventID"):
            data.append(("EventID", e_id.text))
    elif eventID.tag == f"{ns}EventData":
        for attr in eventID.iter(f"{ns}Data"):
            data.append((attr.get('Name'), attr.text))

# data is a list of tuples, so the plain constructor is the right entry
# point (from_dict is meant for dict input).
df = pd.DataFrame(data)
df.to_csv('event_log.csv', index=False, header=False)
print(df)
Output:
0 1
0 EventID 4634
1 TargetUserSid S-1-5-21-2795111079-3225111112-3329435632-1610
2 TargetUserName grant.larson
3 TargetDomainName AFC
4 TargetLogonId 0x3642df8
5 LogonType 3
The CSV File doesn't contain the index and header:
EventID,4634
TargetUserSid,S-1-5-21-2795111079-3225111112-3329435632-1610
TargetUserName,grant.larson
TargetDomainName,AFC
TargetLogonId,0x3642df8
LogonType,3
You can transpose the output with df.T:
df.T.to_csv('event_log.csv', index=False, header=False)
I have an xml file called persons.xml in the following format:
<?xml version="1.0" encoding="UTF-8"?>
<persons>
<person id="1" name="John">
<city id="21" name="New York"/>
</person>
<person id="2" name="Mary">
<city id="22" name="Los Angeles"/>
</person>
</persons>
I want to export to a file the list of person names along with the city names
import pandas as pd
import xml.etree.ElementTree as ET
# Read persons.xml and take its root element (<persons>).
tree = ET.parse('./persons.xml')
root = tree.getroot()

df_cols = ["person_name", "city_name"]
# One record per direct child of the root; only the "name" attribute of the
# child itself is read, so the city_name column stays unfilled.
rows = [{"person_name": node.attrib.get("name")} for node in root]

out_df = pd.DataFrame(rows, columns = df_cols)
out_df
Obviously this part of the code will only work for obtaining the name as it’s part of the root, but I can’t figure out how to loop through the child nodes too and obtain this info. Do I need to append something to root to iterate over the child nodes?
I can obtain everything using root.getchildren but it doesn’t allow me to return only the child nodes:
children = root.getchildren()
for child in children:
ElementTree.dump(child)
Is there a good way to get this information?
See below
import xml.etree.ElementTree as ET
import pandas as pd
# Inline copy of persons.xml so the example runs without any file on disk.
xml = '''<?xml version="1.0" encoding="UTF-8"?>
<persons>
<person id="1" name="John">
<city id="21" name="New York" />
</person>
<person id="2" name="Mary">
<city id="22" name="Los Angeles" />
</person>
</persons>'''

root = ET.fromstring(xml)

# One record per <person>: its own name plus the name of its <city> child.
data = []
for p in root.findall('.//person'):
    city = p.find('city')
    data.append({
        # Key spelling kept as-is so the column header matches the printed
        # output shown with this example.
        'parson': p.get('name'),
        # A <person> without a <city> gives None instead of raising
        # AttributeError on the missing element.
        'city': city.get('name') if city is not None else None,
    })

df = pd.DataFrame(data)
print(df)
output
parson city
0 John New York
1 Mary Los Angeles
I have an XML file in which I need to replace the value inside the filter tag. I have tried to parse this XML, but I get an error that I am unable to resolve. Can anybody please help me replace the value?
path = """
<SOAP:Envelope xmlns:SOAP="http://schemas.xmlsoap.org/soap/envelope/">
<SOAP:Header>
<header xmlns="xmlapi_1.0">
<security>
<user>testuser</user>
<password hashed="false">testpassword</password>
</security>
<requestID>XML_API_client#n</requestID>
</header>
</SOAP:Header>
<SOAP:Body>
<find>
<fullClassName>equipment.PhysicalPort</fullClassName>
<filter>
<equal name="siteId" value="x.x.x.x." />
</filter>
<resultFilter>
<attribute>objectFullName</attribute>
<attribute>displayedName</attribute>
<attribute>portName</attribute>
<attribute>description</attribute>
<attribute>lagMembershipId</attribute>
<attribute>encapType</attribute>
<attribute>mode</attribute>
<attribute>speed</attribute>
<attribute>mtuValue</attribute>
</resultFilter>
</find>
<find xmlns="xmlapi_1.0">
<fullClassName>ethernetequipment.EthernetPortSpecifics</fullClassName>
<filter>
<equal name="siteId" value="x.x.x.x." />
</filter>
<resultFilter>
<attribute>autoNegotiate</attribute>
<attribute>downWhenLooped</attribute>
</resultFilter>
</find>
"""
The other way I tried:
path = '../request.xml'
# Replacement for everything that follows `value=` on a matching line: the
# quoted address plus the tag terminator.
IP = '"' + '63.130.111.89' + '"' + '/>'
string1, string2 = "siteId", "value"

# Copy the request line by line into test.xml, rewriting the value attribute
# on every line that mentions siteId. Both files are closed (and the output
# flushed) when the with-block ends.
with open(path, 'r') as source, open('test.xml', 'w+') as f:
    for line in source:
        output_line = line
        head, marker, tail = line.partition(string2 + '=')
        # `marker` is empty when the line has no "value=", so such a line is
        # copied unchanged instead of raising IndexError.
        if string1 in line and marker:
            # Put the line ending back: it is part of the replaced tail.
            line_end = '\n' if tail.endswith('\n') else ''
            output_line = head + marker + IP + line_end
        f.write(output_line)
The simple code below prints certain elements and their attributes in a dataframe.
It iterates through an XML file, looks for these elements, and prints them out.
Code
import xml.etree.ElementTree as ET
import pandas as pd
# Parse the input document once and search from its root.
tree = ET.parse('1last.xml')
root = tree.getroot()

# <Description> carries text as well as attributes, so print both.
for neighbor in root.iter('Description'):
    print(neighbor.attrib, neighbor.text)

# For the remaining tags only the attributes are printed, one tag after the
# other in this order.
for tag_name in ('SetData', 'FileX', 'FileY'):
    for neighbor in root.iter(tag_name):
        print(neighbor.attrib)
Output
I want to export the output to an Excel table, but it doesn’t seem to work.
I have tried this
export_excel = root.to_excel (r'C:\Users\fsdf.LAPTOP-E8A1PPIN\Desktop\test\export_dataframe.xlsx', index = None, header=True)
but I got the error saying “AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'to_excel'”.
This is my XML file:
<?xml version="1.0" encoding="utf-8"?>
<ProjectData>
<FINAL>
<START id="ID0001" service_code="0x5196">
<Docs Docs_type="START">
<Rational>225196</Rational>
<Qualify>6251960000A0DE</Qualify>
</Docs>
<Description num="1213f2312">The parameter</Description>
<SetFile dg="" dg_id="">
<SetData value="32" />
</SetFile>
</START>
<START id="DG0003" service_code="0x517B">
<Docs Docs_type="START">
<Rational>23423</Rational>
<Qualify>342342</Qualify>
</Docs>
<Description num="3423423f3423">The third</Description>
<SetFile dg="" dg_id="">
<FileX dg="" axis_pts="2" name="" num="" dg_id="" />
<FileY unit="" axis_pts="20" name="TOOLS" text_id="23423" unit_id="" />
<SetData x="E1" value="21259" />
<SetData x="E2" value="0" />
</SetFile>
</START>
<START id="ID0048" service_code="0x5198">
<RawData rawdata_type="OPDATA">
<Request>225198</Request>
<Response>343243324234234</Response>
</RawData>
<Meaning text_id="434234234">The forth</Meaning>
<ValueDataset unit="m" unit_id="FEDS">
<FileX dg="kg" discrete="false" axis_pts="19" name="weight" text_id="SDF3" unit_id="SDGFDS" />
<SetData xin="sdf" xax="233" value="323" />
<SetData xin="123" xax="213" value="232" />
<SetData xin="2321" xax="232" value="23" />
</ValueDataset>
</START>
</FINAL>
</ProjectData>
This is what I would want the table to look like.
One approach would be to use a library such as openpyxl to write the Excel file directly. The following shows how this could be done:
import openpyxl
from bs4 import BeautifulSoup
# Parse the XML with BeautifulSoup's 'lxml' parser. Tag names are queried in
# lowercase below.
# NOTE(review): presumably this parser normalizes tag case - confirm.
with open('1last.xml') as f_input:
    soup = BeautifulSoup(f_input, 'lxml')

wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Sheet1"

# First section: a header row, then one row per <Description> element.
ws.append(["Description", "num", "text"])
for description in soup.find_all("description"):
    ws.append(["", description['num'], description.text])

# Second section: a header row, then one row per <SetData> element; an
# attribute missing on an element becomes an empty cell.
setdata_columns = ('x', 'value', 'xin', 'xax')
ws.append(["SetData", "x", "value", "xin", "xax"])
for setdata in soup.find_all("setdata"):
    ws.append([""] + [setdata.get(column, '') for column in setdata_columns])

wb.save(filename="1last.xlsx")
This would create an Excel file looking like:
For each instrumentData in the below XML file, I need to print the id and the values in the data value=" " field. (the value between the quotes) All this using python 2.7
This is my XML file:
<instrumentDatas>
<instrumentData>
<instrument>
<id>Stephan</id>
</instrument>
<data value="A" />
<data value="B" />
<data value="C" />
</instrumentData>
<instrumentData>
<instrument>
<id>Patrick</id>
</instrument>
<data value="F" />
<data value="G" />
<data value="H" />
</instrumentData>
I am able to print the id for each instrumentData with the code below, but I can't figure out how to print the values.
from xml.dom import minidom
# Parse the file, then narrow down to the first <instrumentDatas> element.
xmldoc = minidom.parse("C:/Users/Desktop/PythonXMLproject/Smallfile.xml")
instrumentDatas = xmldoc.getElementsByTagName("instrumentDatas")[0]
instrumentDatax = instrumentDatas.getElementsByTagName("instrumentData")

# Print the text of the first <id> found inside each <instrumentData>.
for instrumentData in instrumentDatax:
    id_element = instrumentData.getElementsByTagName("id")[0]
    idx = id_element.firstChild.data
    print(idx)
Thank you
import xml.dom.minidom
DOMTree = xml.dom.minidom.parse("C:/Users/Desktop/PythonXMLproject/Smallfile.xml")
collection = DOMTree.documentElement
instrumentDatas = collection.getElementsByTagName("instrumentData")

# For every <instrumentData>: print the text of its first <id>, then the
# "value" attribute of each <data> element that has one.
# print(...) with a single pre-formatted string gives the same output on
# Python 2.7 and also runs on Python 3.
for instrumentData in instrumentDatas:
    idx = instrumentData.getElementsByTagName("id")[0].firstChild.data
    print("ID: %s" % idx)
    for data in instrumentData.getElementsByTagName("data"):
        if data.hasAttribute('value'):
            print("Value: %s" % data.getAttribute('value'))