Append Items to list of dict from xml - python

I thought this would be easy, but for some reason I am not able to append a dict to the list: each iteration overwrites the previous data.
for child in data.find_all("item"):
if "Traffic" in child.find("name").string:
self.output["Name"] = child.find("name").string
self.output["LastValue"] = child.find("lastvalue").string
self.results.append(self.output)
print(self.results)
Here is the input data:
data = """
<item>
<name>In</name>
<lastvalue>5,000 MByte</lastvalue>
</item>
<item>
<name>Out</name>
<lastvalue>155 MByte</lastvalue>
</item>
<item>
<name>Total</name>
<lastvalue>5,000 MByte</lastvalue>
</item>
I tried running the code, but it always prints the last item, as it is overwriting the previous data.
output = [{"Name": "In", "LastValue": "5,000 MByte",
"Name": "Out", "LastValue": "5,000 MByte",
"Name": "Total", "LastValue": "5,000 MByte"}]

You can use zip() function to zip values from <name> and <lastvalue>. Then use dict comprehension:
data = """<item>
<name>In</name>
<lastvalue>5,000 MByte</lastvalue>
</item>
<item>
<name>Out</name>
<lastvalue>155 MByte</lastvalue>
</item>
<item>
<name>Total</name>
<lastvalue>5,000 MByte</lastvalue>
</item>"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(data, 'lxml')

# Pair every <name> with the <lastvalue> found at the same position and
# collect all pairs into one dict, stored as the single element of the list.
name_tags = soup.select('name')
value_tags = soup.select('lastvalue')
results = [{n.text: v.text for n, v in zip(name_tags, value_tags)}]
print(results)
Prints:
[{'In': '5,000 MByte', 'Out': '155 MByte', 'Total': '5,000 MByte'}]
EDIT: If there are more <lastvalue>:
data = """<item>
<name>In</name>
<lastvalue>5,000 MByte</lastvalue>
</item>
<item>
<name>Out</name>
<lastvalue>155 MByte</lastvalue>
<lastvalue>10,100 MByte</lastvalue>
</item>
<item>
<name>Total</name>
<lastvalue>5,000 MByte</lastvalue>
</item>"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(data, 'lxml')

# Build one dict per <name>: its text maps to the texts of every
# <lastvalue> sibling that follows it.
results = []
for name_tag in soup.select('name'):
    values = [sibling.text for sibling in name_tag.find_next_siblings('lastvalue')]
    results.append({name_tag.text: values})
print(results)
Prints:
[{'In': ['5,000 MByte']},
{'Out': ['155 MByte', '10,100 MByte']},
{'Total': ['5,000 MByte']}]

Related

Get items from xml Python

I have an XML document in Python and need to obtain the elements of the "Items" tag as an iterable list.
For example, I need to get an iterable list like this from the XML:
Item 1: Bicycle, value $250, iva_tax: 50.30
Item 2: Skateboard, value $120, iva_tax: 25.0
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<data>
<info>Listado de items</info>
<detalle>
<![CDATA[<?xml version="1.0" encoding="UTF-8"?>
<tienda id="tiendaProd" version="1.1.0">
<items>
<item>
<nombre>Bicycle</nombre>
<valor>250</valor>
<data>
<tax name="iva" value="50.30"></tax>
</data>
</item>
<item>
<nombre>Skateboard</nombre>
<valor>120</valor>
<data>
<tax name="iva" value="25.0"></tax>
</data>
</item>
<item>
<nombre>Motorcycle</nombre>
<valor>900</valor>
<data>
<tax name="iva" value="120.50"></tax>
</data>
</item>
</items>
</tienda>]]>
</detalle>
</data>
I am working with
import xml.etree.ElementTree as ET
for example
import xml.etree.ElementTree as ET
xml = ET.fromstring(stringBase64)
ite = xml.find('.//detalle').text
tixml = ET.fromstring(ite)
You can use BeautifulSoup4 (BS4) to do this.
from bs4 import BeautifulSoup

# Read the XML file as one string. BeautifulSoup expects a string or an open
# file handle, not the list of lines that readlines() returns.
# NOTE(review): in the question the <item> elements sit inside a CDATA section
# of <detalle>; this assumes example.xml holds the inner <tienda> document
# itself -- confirm before relying on it.
with open("example.xml", "r") as f:
    contents = f.read()

# Create the soup object using the XML parser
soup = BeautifulSoup(contents, 'xml')

# Find all the <item> tags
item_tags = soup.find_all("item")

# Map the <nombre> text of each item to its <valor> text
results = {
    item.find("nombre").text: item.find("valor").text
    for item in item_tags
}

# Prints dictionary with key value pairs from the xml
print(results)

Read xml file to dataframe in python

I have an XML file that I need to read into a pandas DataFrame in Python. This is part of the XML:
<?xml version="1.0" encoding="UTF-8"?>
<root>
<data id="root_661191">
<index id="data_162062">
<item id="index_829361_1">173915</item>
<item id="index_829361_2">14712</item>
<item id="index_829361_3">321255</item>
</index>
<property_id id="data_809625">
<item id="property_id_844926_1">88942.0</item>
<item id="property_id_844926_2">88162.0</item>
<item id="property_id_844926_3">80553.0</item>
</property_id>
<addr_street id="data_409265">
<item id="addr_street_959977_1">58 Middleton Street</item>
<item id="addr_street_959977_2">24 Royena Road</item>
<item id="addr_street_959977_3">9 Cafardi Boulevard</item>
</addr_street>
<price id="data_784942">
<item id="price_225606_1">7480000.0</item>
<item id="price_225606_2">7728000.0</item>
<item id="price_225606_3">7659000.0</item>
</price>
</data>
</root>
I tried my read function on some simpler sample data and it worked. But when I use it on this XML file, the output contains only None. I think the column names might be the problem, but I don't know how to fix it. Could anyone help me?
The function I used is:
import pandas as pd
import xml.etree.ElementTree as et
def parse_xml(xml_file, df_cols):
    """Parse a row-oriented XML file into a pandas DataFrame.

    Every direct child of the document root is treated as one row. The
    first name in ``df_cols`` is looked up in that child's XML attributes;
    each remaining name is taken from the text of the first sub-element
    with that tag. Anything that is missing becomes ``None``.

    Args:
        xml_file: File name or file object accepted by ``ElementTree.parse``.
        df_cols: Column names; must contain at least one entry.

    Returns:
        A DataFrame with one row per root child and the columns of
        ``df_cols`` in the given order.
    """
    xroot = et.parse(xml_file).getroot()
    rows = []
    for node in xroot:
        # The first column comes from an attribute, the rest from children.
        res = [node.attrib.get(df_cols[0])]
        for el in df_cols[1:]:
            child = node.find(el)
            res.append(child.text if child is not None else None)
        rows.append(dict(zip(df_cols, res)))
    return pd.DataFrame(rows, columns=df_cols)
df_cols = ['index', 'property_id', 'addr_street', 'price']
# Call the function with parentheses; square brackets would try to
# subscript the function object and raise a TypeError.
parse_xml('myxmlfile.xml', df_cols)
I think this is what you want. You should be able to put this in a function if you need
tree = et.parse('myxmlfile.xml')
root = tree.getroot()
df_cols = ['index', 'property_id', 'addr_street', 'price']

# For every requested column, look inside each <data> element for child
# elements with that tag and turn the texts of their <item> children into a
# single-column DataFrame.
mlist = []
for col in df_cols:
    for data_node in root.findall('data'):
        for col_node in data_node.findall(col):
            values = [item.text for item in col_node.findall('item')]
            mlist.append(pd.DataFrame({col: values}))

# Notebook-style inspection of the intermediate list.
mlist
# Place the single-column frames side by side.
pd.concat(mlist, axis=1)
Output:
index property_id addr_street price
0 173915 88942.0 58 Middleton Street 7480000.0
1 14712 88162.0 24 Royena Road 7728000.0
2 321255 80553.0 9 Cafardi Boulevard 7659000.0

Python lxml does not support xpath syntax 'starts-with'?

str = """<ROOT>
<ITEM>
<REVENUE_YEAR>2554-02</REVENUE_YEAR>
<REGION>Central</REGION>
</ITEM>
<ITEM>
<REVENUE_YEAR>2552-02</REVENUE_YEAR>
<REGION>Central</REGION>
</ITEM>
<ITEM>
<REVENUE_YEAR>2552-03</REVENUE_YEAR>
<REGION>Central</REGION>
</ITEM>
</ROOT>"""
xml = etree.fromstring(str)
xpath_str = ".//ITEM[starts-with(REVENUE_YEAR,'2554')]"
result = xml.find(xpath_str)
print(result)
Hi, the code above raised SyntaxError: invalid predicate. Does this mean lxml does not support starts-with? Is there any other way to locate the REVENUE_YEAR element (2554-02) by XPath with lxml? Thanks!
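lxml does support full XPath (including starts-with), but you need to call the xpath() method; find() only understands the limited ElementPath subset: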
from lxml import etree

# Named xml_text rather than str so the builtin str() is not shadowed.
xml_text = """<ROOT>
<ITEM>
<REVENUE_YEAR>2554-02</REVENUE_YEAR>
<REGION>Central</REGION>
</ITEM>
<ITEM>
<REVENUE_YEAR>2552-02</REVENUE_YEAR>
<REGION>Central</REGION>
</ITEM>
<ITEM>
<REVENUE_YEAR>2552-03</REVENUE_YEAR>
<REGION>Central</REGION>
</ITEM>
</ROOT>"""
xml = etree.fromstring(xml_text)
xpath_str = ".//ITEM[starts-with(REVENUE_YEAR,'2554')]"
# XPath functions such as starts-with() need the full XPath engine behind
# xpath(); find() rejects this predicate.
result = xml.xpath(xpath_str)
print(result)  # which is a list containing only one element

Quick way to Upper every value in xml?

I have the following xml:
<Item>
<Platform>itunes</Platform>
<PlatformID>102224185</PlatformID>
<Info>
<LanguageOfMetadata>EN</LanguageOfMetadata>
<Name>Commander In Chief</Name>
<Studio>abc</Studio>
</Info>
<Type>TVSeries</Type>
</Item>
What would be the quickest way to UPPER all the values? For example:
<Item>
<Platform>ITUNES</Platform>
<PlatformID>102224185</PlatformID>
<Info>
<LanguageOfMetadata>EN</LanguageOfMetadata>
<Name>COMMANDER IN CHIEF</Name>
<Studio>ABC</Studio>
</Info>
<Type>TVSERIES</Type>
</Item>
You can find all elements and call upper() on each element's text:
import lxml.etree as ET
data = """<Item>
<Platform>itunes</Platform>
<PlatformID>102224185</PlatformID>
<Info>
<LanguageOfMetadata>EN</LanguageOfMetadata>
<Name>Commander In Chief</Name>
<Studio>abc</Studio>
</Info>
<Type>TVSeries</Type>
</Item>
"""
root = ET.fromstring(data)
for elm in root.xpath("//*"):  # //* would find all elements recursively
    # Elements without text content (e.g. <Empty/>) have text set to None,
    # so only upper-case text that is actually present.
    if elm.text:
        elm.text = elm.text.upper()
# encoding="unicode" makes tostring() return str instead of bytes, so the
# printed result is the plain XML rather than a b'...' literal.
print(ET.tostring(root, encoding="unicode"))
Prints:
<Item>
<Platform>ITUNES</Platform>
<PlatformID>102224185</PlatformID>
<Info>
<LanguageOfMetadata>EN</LanguageOfMetadata>
<Name>COMMANDER IN CHIEF</Name>
<Studio>ABC</Studio>
</Info>
<Type>TVSERIES</Type>
</Item>
This though does not cover cases when you, for example, have a tail of an element - e.g. have <Studio>ABC</Studio>test instead of just <Studio>ABC</Studio>. To support that as well, put the following under the for loop as well:
elm.tail = elm.tail.upper() if elm.tail else None
Here is a way to upper everything, though note that this will include the tags as well:
# Upper-case the whole serialized document (tag names and text alike) and
# parse the result back into an element.
node = etree.fromstring(etree.tostring(item).upper())
# print() as a function works on Python 2 and 3; decode the serialized bytes
# so the XML is printed as text.
print(etree.tostring(node, pretty_print=True).decode())
<ITEM>
<PLATFORM>ITUNES</PLATFORM>
<PLATFORMID>102224185</PLATFORMID>
<INFO>
<LANGUAGEOFMETADATA>EN</LANGUAGEOFMETADATA>
<NAME>COMMANDER IN CHIEF</NAME>
<STUDIO>ABC</STUDIO>
</INFO>
<TYPE>TVSERIES</TYPE>
</ITEM>
Assuming you can parse the XML file you can just rewrite the contents using the .upper() function that is built into python for strings. You can call it like that:
"mystring".upper().

Modify XML file using ElementTree

I am trying to do the following with Python:
get "price" value and change it
find "price_qty" and insert new line with new tier and different price based on the "price".
So far I could only find the price, change it, and insert a line in about the correct place, but I can't find a way to get the "item" element with its "qty" and "price" attributes in there; nothing has worked so far...
this is my original xml:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<body start="20.04.2014 10:02:60">
<pricelist>
<item>
<name>LEO - red pen</name>
<price>31,4</price>
<price_snc>0</price_snc>
<price_ao>0</price_ao>
<price_qty>
<item qty="150" price="28.20" />
<item qty="750" price="26.80" />
<item qty="1500" price="25.60" />
</price_qty>
<stock>50</stock>
</item>
</pricelist>
the new xml should look this way:
<pricelist>
<item>
<name>LEO - red pen</name>
<price>31,4</price>
<price_snc>0</price_snc>
<price_ao>0</price_ao>
<price_qty>
<item qty="10" price="31.20" /> **-this is the new line**
<item qty="150" price="28.20" />
<item qty="750" price="26.80" />
<item qty="1500" price="25.60" />
</price_qty>
<stock>50</stock>
</item>
</pricelist>
my code so far:
import xml.etree.cElementTree as ET
from xml.etree.ElementTree import Element, SubElement
tree = ET.ElementTree(file='pricelist.xml')
root = tree.getroot()
pos=0
# price - raise the main price and insert new tier
for elem in tree.iterfind('pricelist/item/price'):
price = elem.text
newprice = (float(price.replace(",", ".")))*1.2
newtier = "NEW TIER"
SubElement(root[0][pos][5], newtier)
pos+=1
tree.write('pricelist.xml', "UTF-8")
result:
...
<price_qty>
<item price="28.20" qty="150" />
<item price="26.80" qty="750" />
<item price="25.60" qty="1500" />
<NEW TIER /></price_qty>
thank you for any help.
Don't use fixed indexing. You already have the item element, so why not use it?
tree = ET.ElementTree(file='pricelist.xml')
root = tree.getroot()

# Visit each <item> directly below <pricelist> and prepend a new quantity
# tier whose price is the item's main price raised by 20 %.
for item in tree.iterfind('pricelist/item'):
    # Prices use a decimal comma (e.g. "31,4"); normalise before converting.
    base_price = float(item.findtext('price').replace(",", "."))
    raised_price = base_price * 1.2
    tier = ET.Element("item", {"qty": "10", "price": format(raised_price, ".2f")})
    # insert(0, ...) places the new tier ahead of the existing ones.
    item.find('price_qty').insert(0, tier)

tree.write('pricelist.xml', encoding="UTF-8")

Categories

Resources