Read xml file to dataframe in python - python

I have an XML file that I need to read into a pandas DataFrame in Python. This is part of the XML:
<?xml version="1.0" encoding="UTF-8"?>
<root>
<data id="root_661191">
<index id="data_162062">
<item id="index_829361_1">173915</item>
<item id="index_829361_2">14712</item>
<item id="index_829361_3">321255</item>
</index>
<property_id id="data_809625">
<item id="property_id_844926_1">88942.0</item>
<item id="property_id_844926_2">88162.0</item>
<item id="property_id_844926_3">80553.0</item>
</property_id>
<addr_street id="data_409265">
<item id="addr_street_959977_1">58 Middleton Street</item>
<item id="addr_street_959977_2">24 Royena Road</item>
<item id="addr_street_959977_3">9 Cafardi Boulevard</item>
</addr_street>
<price id="data_784942">
<item id="price_225606_1">7480000.0</item>
<item id="price_225606_2">7728000.0</item>
<item id="price_225606_3">7659000.0</item>
</price>
</data>
</root>
I tried some simpler sample data to test my read function, and it works. But when I use the function on this XML file, the output contains only None. I think it might be the column names, but I don't know how to fix it. Could anyone help me?
The function I used is:
import pandas as pd
import xml.etree.ElementTree as et
def parse_xml(xml_file, df_cols):
    """Parse a row-oriented XML file into a pandas DataFrame.

    Every direct child of the document root is treated as one record.
    The first name in ``df_cols`` is looked up as an *attribute* of the
    record element; every remaining name is looked up as a direct child
    element of the record, and that child's text becomes the cell value.
    Anything that cannot be found yields ``None``.

    NOTE: a column-oriented document (one element per column, each holding
    a list of ``<item>`` children) does not match this layout, so every
    lookup comes back empty and the frame is filled with ``None``.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.
        df_cols: Column names; the first is an attribute name, the rest
            are child tag names.

    Returns:
        A DataFrame with one row per child of the root and the columns
        given in ``df_cols``.
    """
    xroot = et.parse(xml_file).getroot()
    rows = []
    for node in xroot:
        # First column: attribute of the record element itself.
        row = {df_cols[0]: node.attrib.get(df_cols[0])}
        for col in df_cols[1:]:
            # Look the child up once and reuse the result.
            child = node.find(col)
            row[col] = child.text if child is not None else None
        rows.append(row)
    return pd.DataFrame(rows, columns=df_cols)
df_cols = ['index', 'property_id', 'addr_street', 'price']
# A function is invoked with parentheses; parse_xml[...] would subscript the
# function object and raise TypeError before any parsing happens.
parse_xml('myxmlfile.xml', df_cols)

I think this is what you want. You should be able to put this in a function if you need
tree = et.parse('myxmlfile.xml')
root = tree.getroot()
df_cols = ['index', 'property_id', 'addr_street', 'price']
# One single-column DataFrame per column element found; they are joined
# side by side once all columns have been collected.
mlist = []
for col in df_cols:
    for d in root.findall('data'):
        for c in d.findall(col):
            # The cell values of a column are the texts of its <item> children.
            lst = [itm.text for itm in c.findall('item')]
            mlist.append(pd.DataFrame({col: lst}))
# axis=1 places the per-column frames next to each other, aligned on row index.
pd.concat(mlist, axis=1)
Output:
index property_id addr_street price
0 173915 88942.0 58 Middleton Street 7480000.0
1 14712 88162.0 24 Royena Road 7728000.0
2 321255 80553.0 9 Cafardi Boulevard 7659000.0

Related

Parsing Eventlogs XML file to CSV in Python

I am working on a simple Python script to extract certain data from an XML file. The XML contains Windows events and their event IDs. My code is shown below. It fails when it needs to extract the data: the output file is created, but it is empty.
from xml.etree import ElementTree as ET
import csv
tree = ET.parse("SecurityLog-rev2.xml")
root = tree.getroot()
# Everything in the first child's tag that precedes the trailing "Event".
# Assumes the root's first child is an <Event>, so this is the "{namespace}"
# prefix ElementTree puts in front of namespaced tags.
url = root[0].tag[:-len("Event")]
fieldnames = ['EventID']
# newline='' is what the csv module asks for when opening its output file.
with open('event_log.csv', 'w', newline='') as csvfile:
    writecsv = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writecsv.writeheader()
    for event in root:
        system = event.find(url + "System")
        output = {}
        fields = ['EventID']
        # for tag,att in fields:
        # output[tag] = system.find(url + tag).attrib[att]
        # Look EventData up once; "is not None" because an element without
        # children is falsy even though it exists.
        event_data = event.find(url + "EventData")
        if event_data is not None:
            for data in event_data:
                name = data.attrib['Name']
                output[name] = data.text
        # NOTE(review): the keys stored in output are Data/@Name values, none
        # of which is listed in fieldnames, and EventID itself is never put
        # into output. DictWriter rejects unknown keys with ValueError by
        # default, which is why no data rows reach the file.
        writecsv.writerow(output)
<Event xmlns='http://schemas.microsoft.com/win/2004/08/events/event'><System><Provider Name='Microsoft-Windows-Security-Auditing' Guid='{54849625-5478-4994-A5BA-3E3B0328C30D}'/>
<EventID>4634</EventID>
<Version>0</Version><Level>0</Level><Task>12545</Task><Opcode>0</Opcode><Keywords>0x8020000000000000</Keywords><TimeCreated SystemTime='2011-04-16T15:07:53.890625000Z'/>
<EventRecordID>1410962</EventRecordID><Correlation/><Execution ProcessID='452' ThreadID='3900'/><Channel>Security</Channel><Computer>DC01.AFC.com</Computer><Security/></System>
<EventData><Data Name='TargetUserSid'>S-1-5-21-2795111079-3225111112-3329435632-1610</Data>
<Data Name='TargetUserName'>grant.larson</Data>
<Data Name='TargetDomainName'>AFC</Data><Data Name='TargetLogonId'>0x3642df8</Data><Data Name='LogonType'>3</Data></EventData></Event>
I am not sure what exactly you would parse. Here is a solution for the Id and the events:
Your XML File provided above as Input:
<?xml version="1.0" encoding="utf-8"?>
<Event xmlns='http://schemas.microsoft.com/win/2004/08/events/event'>
<System>
<Provider Name='Microsoft-Windows-Security-Auditing' Guid='{54849625-5478-4994-A5BA-3E3B0328C30D}' />
<EventID>4634</EventID>
<Version>0</Version>
<Level>0</Level>
<Task>12545</Task>
<Opcode>0</Opcode>
<Keywords>0x8020000000000000</Keywords>
<TimeCreated SystemTime='2011-04-16T15:07:53.890625000Z' />
<EventRecordID>1410962</EventRecordID>
<Correlation />
<Execution ProcessID='452' ThreadID='3900' />
<Channel>Security</Channel>
<Computer>DC01.AFC.com</Computer>
<Security />
</System>
<EventData>
<Data Name='TargetUserSid'>S-1-5-21-2795111079-3225111112-3329435632-1610</Data>
<Data Name='TargetUserName'>grant.larson</Data>
<Data Name='TargetDomainName'>AFC</Data>
<Data Name='TargetLogonId'>0x3642df8</Data>
<Data Name='LogonType'>3</Data>
</EventData>
</Event>
The program code without regex for catching the namespace:
from xml.etree import ElementTree as ET
import pandas as pd
import csv
tree = ET.parse("SecurityLog-rev2.xml")
root = tree.getroot()
# ElementTree spells a namespaced tag as "{namespace-uri}localname".
ns = "{http://schemas.microsoft.com/win/2004/08/events/event}"
# Collected (name, value) pairs, in document order.
data = []
# ".//*" selects every descendant of the root.
for eventID in root.findall(".//*"):
    if eventID.tag == f"{ns}System":
        # The event id is the text of the EventID element below System.
        for e_id in eventID.iter(f"{ns}EventID"):
            data.append(("EventID", e_id.text))
    if eventID.tag == f"{ns}EventData":
        # Each Data element contributes its Name attribute and its text.
        for attr in eventID.iter(f"{ns}Data"):
            data.append((attr.get('Name'), attr.text))
# A list of 2-tuples becomes a two-column frame with default column labels 0 and 1.
df = pd.DataFrame(data)
df.to_csv('event_log.csv', index=False, header=False)
print(df)
Output:
0 1
0 EventID 4634
1 TargetUserSid S-1-5-21-2795111079-3225111112-3329435632-1610
2 TargetUserName grant.larson
3 TargetDomainName AFC
4 TargetLogonId 0x3642df8
5 LogonType 3
The CSV File doesn't contain the index and header:
EventID,4634
TargetUserSid,S-1-5-21-2795111079-3225111112-3329435632-1610
TargetUserName,grant.larson
TargetDomainName,AFC
TargetLogonId,0x3642df8
LogonType,3
You can transpose the output with df.T before writing it:
# df.T swaps rows and columns, so the names end up in the first CSV line and
# the values in the second.
df.T.to_csv('event_log.csv', index=False, header=False)

Get items from xml Python

I have an XML document in Python and need to obtain the elements of the "items" tag as an iterable list.
I need to get an iterable list from this XML, for example like this:
Item 1: Bicycle, value $250, iva_tax: 50.30
Item 2: Skateboard, value $120, iva_tax: 25.0
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<data>
<info>Listado de items</info>
<detalle>
<![CDATA[<?xml version="1.0" encoding="UTF-8"?>
<tienda id="tiendaProd" version="1.1.0">
<items>
<item>
<nombre>Bicycle</nombre>
<valor>250</valor>
<data>
<tax name="iva" value="50.30"></tax>
</data>
</item>
<item>
<nombre>Skateboard</nombre>
<valor>120</valor>
<data>
<tax name="iva" value="25.0"></tax>
</data>
</item>
<item>
<nombre>Motorcycle</nombre>
<valor>900</valor>
<data>
<tax name="iva" value="120.50"></tax>
</data>
</item>
</items>
</tienda>]]>
</detalle>
</data>
I am working with
import xml.etree.ElementTree as ET
for example
import xml.etree.ElementTree as ET
# Parse the outer document. stringBase64 is presumably the XML text shown
# above — TODO confirm.
xml = ET.fromstring(stringBase64)
# Text content of <detalle>; in the sample this is the CDATA payload, which is
# itself a complete XML document.
ite = xml.find('.//detalle').text
# NOTE(review): in the sample there is a line break between <detalle> and the
# CDATA section, so ite starts with whitespace. An XML declaration has to be
# the very first thing in the parsed text, so ite probably needs .strip()
# before this call — confirm.
tixml = ET.fromstring(ite)
You can use BeautifulSoup4 (BS4) to do this.
from bs4 import BeautifulSoup
# Read the XML file as one string; BeautifulSoup expects markup text, not the
# list of lines that readlines() returns.
with open("example.xml", "r", encoding="utf-8") as f:
    contents = f.read()
# Create Soup object
soup = BeautifulSoup(contents, 'xml')
# In the question's document the <item> elements sit inside a CDATA section of
# <detalle>. The XML parser returns that section as plain text, so it has to
# be parsed a second time. Files without <detalle> are used as they are.
detalle = soup.find("detalle")
if detalle is not None:
    # strip(): the embedded XML declaration must come first in the text.
    soup = BeautifulSoup(detalle.text.strip(), 'xml')
# find all the item tags
item_tags = soup.find_all("item")  # returns everything in the <item> tags
# find the nombre and valor tags within each item
results = {}
for item in item_tags:
    num = item.find("nombre").text
    val = item.find("valor").text
    results[str(num)] = val
# Prints dictionary with key value pairs from the xml
print(results)

Python convert xml to csv with BeautifulSoup doesn't work on azure

I have an issue converting an XML file to CSV using BeautifulSoup in an Azure App Service. When I run it locally everything is fine. The first difference is at the soup line:
Parsing code:
file_path = os.path.join(INPUT_DIRECTOR, "test.xml")
# The with block closes the file as soon as its content has been read.
with open(file_path, encoding="utf8") as source:
    soup = BeautifulSoup(source.read(), 'xml')  #doesn't work on global server
# Text of every matching tag, in document order.
First = [case.text for case in soup.find_all('FirstAtt')]
Second = [case.text for case in soup.find_all('SecondAtt')]
Third = [case.text for case in soup.find_all('ThirdAtt')]
# Pair the n-th FirstAtt with the n-th SecondAtt.
results = list(zip(First, Second))
columns = ['1', '2']
df = pd.DataFrame(results, columns=columns)
# The first ThirdAtt value is repeated on every row.
df['ID'] = Third[0]
df.to_csv(OUTPUT_DIRECTORY, index=False, encoding='utf-8-sig')
XML:
<?xml version="1.0" encoding="utf-8"?>
<root>
<ThirdAtt>7290027600007</ThirdAtt>
<Items Count="279">
<Item>
<FirstAtt>2021-09-05 08:00</FirstAtt>
<SecondAtt>5411188134985</SecondAtt>
...
</Item>
<Item>
<FirstAtt>2021-09-05 08:00</FirstAtt>
<SecondAtt>5411188135005</SecondAtt>
...
</Item>
...
When run locally, the soup line is able to read the XML file, but when run on the Azure server, soup does not read the file and remains as:
soup = <?xml version="1.0" encoding="utf-8"?>
Any ideas as to how to solve this?
UPDATE:
Thanks to @balderman, I have changed my code to use ElementTree instead of soup, as recommended:
root = ET.fromstring(xml)
headers = []
for idx, item in enumerate(root.findall('.//Item')):
    data = []
    if idx == 0:
        # Column names are the child tags of the first <Item>.
        headers = [x.tag for x in list(item)]
    for h in headers:
        data.append(item.find(h).text)
    # Positional access: relies on the child order of the first <Item>.
    First.append(data[0])
    Second.append(data[1])
results = list(zip(First, Second))
...
Is there a way to use generic indexes for the appends, in case the position in data[i] changes?
No need for ANY external lib - just use core python ElementTree
import xml.etree.ElementTree as ET
xml = '''<root>
<ThirdAtt>7290027600007</ThirdAtt>
<Items Count="279">
<Item>
<FirstAtt>2021-09-05 08:00</FirstAtt>
<SecondAtt>5411188134985</SecondAtt>
</Item>
<Item>
<FirstAtt>2021-09-05 08:00</FirstAtt>
<SecondAtt>5411188135005</SecondAtt>
</Item>
</Items>
</root>
'''
root = ET.fromstring(xml)
headers = []
for idx, item in enumerate(root.findall('.//Item')):
    if idx == 0:
        # The child tags of the first <Item> define the columns (header line).
        headers = [x.tag for x in item]
        print(','.join(headers))
    # findtext with a default keeps the line printable when a later <Item>
    # lacks one of the header tags, instead of failing on None.
    data = [item.findtext(h, default='') for h in headers]
    print(','.join(data))
output
FirstAtt,SecondAtt
2021-09-05 08:00,5411188134985
2021-09-05 08:00,5411188135005

Large XML parsing in Python

I am a novice in python and have the following task on hand.
I have a large xml file like the one below:
<Configuration>
<Parameters>
<Component Name='ABC'>
<Group Name='DEF'>
<Parameter Name='GHI'>
<Description>
Some Text
</Description>
<Type>Integer</Type>
<Restriction>
<Level>5</Level>
</Restriction>
<Value>
<Item Value='5'/>
</Value>
</Parameter>
<Parameter Name='JKL'>
<Description>
Some Text
</Description>
<Type>Integer</Type>
<Restriction>
<Level>5</Level>
</Restriction>
<Value>
<Item Value='5'/>
</Value>
</Parameter>
</Group>
<Group Name='MNO'>
<Parameter Name='PQR'>
<Description>
Some Text
</Description>
<Type>Integer</Type>
<Restriction>
<Level>5</Level>
</Restriction>
<Value>
<Item Value='5'/>
</Value>
</Parameter>
<Parameter Name='TUV'>
<Description>
Some Text
</Description>
<Type>Integer</Type>
<Restriction>
<Level>5</Level>
</Restriction>
<Value>
<Item Value='5'/>
</Value>
</Parameter>
</Group>
</Component>
</Parameters>
</Configuration>
In this xml file I have to parse through the component "ABC" go to group "MNO" and then to the parameter "TUV" and under this I have to change the item value to 10.
I have tried using xml.etree.cElementTree, but to no avail. lxml is not supported on the server, as it is running a very old version of Python, and I have no permission to upgrade it.
I have been using the following code to parse and edit a relatively small xml:
def fnXMLModification(ArgStr):
    """Apply text and/or attribute changes to elements of an XML file.

    ``ArgStr`` is split on whitespace.  The first token is the path of the
    XML file; every following token describes one change in one of these
    forms::

        path[value]               set the element text
        path[name=val,name=val]   set attributes
        path[value][name=val,...] set text and attributes

    ``path`` is handed to ``ElementTree.find``, so only the first matching
    element is changed.

    NOTE: a token is split on "[", so a path that itself contains an XPath
    predicate such as ``Group[@Name='MNO']`` cannot be expressed in this
    format; whitespace inside a token is not possible either.

    Returns True after the file has been rewritten, False (after logging
    through fnlogs) when no file path is given, the file does not exist, or
    a path matches no element.  In the False cases the file is untouched.
    """
    argList = ArgStr.split()
    if not argList:
        fnlogs("XML File: no file path given.\n")
        return False
    strXMLPath = argList[0]
    if not os.path.exists(strXMLPath):
        fnlogs("XML File: " + strXMLPath + " does not exist.\n")
        return False
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET
    # Parsing by path lets ElementTree open and close the file itself, so no
    # handle stays open while the same file is rewritten below.
    tree = ET.parse(strXMLPath)
    for strXPath in argList[1:]:
        strXPathList = strXPath.split("[")
        sxPath = strXPathList[0]
        # Start from a clean slate for every token so the text or attributes
        # of an earlier token are never applied to a later element.
        valToBeSet = None
        attrList = []
        if len(strXPathList) == 3:
            # both present
            valToBeSet = strXPathList[1].strip("]")
            attrList = strXPathList[2].strip("]").split(",")
        elif len(strXPathList) == 2:
            # only one present: "=" marks an attribute list, otherwise text
            if "=" in strXPathList[1]:
                attrList = strXPathList[1].strip("]").split(",")
            else:
                valToBeSet = strXPathList[1].strip("]")
        node = tree.find(sxPath)
        if node is None:
            fnlogs("XML File: " + strXMLPath + " has no element matching "
                   + sxPath + ".\n")
            return False
        for att in attrList:
            # Split on the first "=" only, so values may contain "=".
            slist = att.split("=", 1)
            node.set(slist[0].strip(), slist[1].strip())
        if valToBeSet is not None:
            node.text = valToBeSet
    tree.write(strXMLPath)
    fnlogs("XML File: " + strXMLPath + " has been modified successfully.\n")
    return True
Using this function I am not able to traverse the current XML, as it has a lot of nested child elements and subgroups.
import statement
import xml.etree.cElementTree as ET
Parse content by fromstring method.
root = ET.fromstring(data)
Iterate according our requirement and get target Item tag and change value of Value attribute
for component_tag in root.iter("Component"):
    # get() returns None for a missing attribute, so no separate "in" test is needed.
    if component_tag.get("Name") == 'ABC':
        for group_tag in component_tag.iter("Group"):
            if group_tag.get("Name") == 'MNO':
                # "@" selects an attribute inside a path predicate.
                for item_tag in group_tag.findall("Parameter[@Name='TUV']/Value/Item"):
                    item_tag.set("Value", "10")
We can use Xpath to get target Item tag
# Each [@Name='...'] predicate keeps only the elements whose Name attribute
# has that value; the path is relative to the root element.
for item_tag in root.findall("Parameters/Component[@Name='ABC']/Group[@Name='MNO']/Parameter[@Name='TUV']/Value/Item"):
    item_tag.set("Value", "10")
Use tostring method to get content.
data = ET.tostring(root)

Modify XML file using ElementTree

I am trying to do the following with Python:
get "price" value and change it
find "price_qty" and insert new line with new tier and different price based on the "price".
So far I could only find the price, change it, and insert a line in about the correct place, but I can't find a way to set the "item" tag and the "qty" and "price" attributes there; nothing has worked so far...
this is my original xml:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<body start="20.04.2014 10:02:60">
<pricelist>
<item>
<name>LEO - red pen</name>
<price>31,4</price>
<price_snc>0</price_snc>
<price_ao>0</price_ao>
<price_qty>
<item qty="150" price="28.20" />
<item qty="750" price="26.80" />
<item qty="1500" price="25.60" />
</price_qty>
<stock>50</stock>
</item>
</pricelist>
the new xml should look this way:
<pricelist>
<item>
<name>LEO - red pen</name>
<price>31,4</price>
<price_snc>0</price_snc>
<price_ao>0</price_ao>
<price_qty>
<item qty="10" price="31.20" /> **-this is the new line**
<item qty="150" price="28.20" />
<item qty="750" price="26.80" />
<item qty="1500" price="25.60" />
</price_qty>
<stock>50</stock>
</item>
</pricelist>
my code so far:
import xml.etree.cElementTree as ET
from xml.etree.ElementTree import Element, SubElement
tree = ET.ElementTree(file='pricelist.xml')
root = tree.getroot()
# Running index of the current <item>, used for the positional lookup below.
pos=0
# price - raise the main price and insert new tier
for elem in tree.iterfind('pricelist/item/price'):
    price = elem.text
    # Decimal comma -> decimal point before converting; computed but not used below.
    newprice = (float(price.replace(",", ".")))*1.2
    newtier = "NEW TIER"
    # SubElement appends a child whose tag is the given string and which has
    # no attributes; hence the <NEW TIER /> at the end in the result.
    # root[0][pos][5] addresses the target purely by child position.
    SubElement(root[0][pos][5], newtier)
    pos+=1
tree.write('pricelist.xml', "UTF-8")
result:
...
<price_qty>
<item price="28.20" qty="150" />
<item price="26.80" qty="750" />
<item price="25.60" qty="1500" />
<NEW TIER /></price_qty>
thank you for any help.
Don't use fixed indexing. You already have the item element, so why not use it?
tree = ET.ElementTree(file='pricelist.xml')
root = tree.getroot()
# Only the <item> elements directly below <pricelist>; the tier <item>
# elements inside <price_qty> are one level deeper and are not matched.
for elem in tree.iterfind('pricelist/item'):
    price = elem.findtext('price')
    tiers = elem.find('price_qty')
    # Skip items that lack a price or a tier list instead of failing on None.
    if price is None or tiers is None:
        continue
    # Decimal comma -> decimal point before converting.
    newprice = float(price.replace(",", ".")) * 1.2
    newtier = ET.Element("item", qty="10", price="%.2f" % newprice)
    # insert(0, ...) makes the new tier the first child of <price_qty>.
    tiers.insert(0, newtier)
tree.write('pricelist.xml', "UTF-8")

Categories

Resources