lxml etree get all text before element

lxml etree get all text before element - python

How can I get all the text before an element in an etree, separated from the text after the element?
from lxml import etree
tree = etree.fromstring('''
<a>
find
<b>
the
</b>
text
<dd></dd>
<c>
before
</c>
<dd></dd>
and after
</a>
''')
What do I want? In this example, the <dd> tags are separators and for all of them
for el in tree.findall('.//dd'):
I would like to have all text before and after them:
[
{
el : <Element dd at 0xsomedistinctadress>,
before : 'find the text',
after : 'before and after'
},
{
el : <Element dd at 0xsomeotherdistinctadress>,
before : 'find the text before',
after : 'and after'
}
]
My idea was to use some kind of placeholders in the tree with which I replace the <dd> tags and then cut the string at that placeholder, but I need the correspondence with the actual element.

There might be a simpler way, but I would use the following XPath expressions:
preceding-sibling::*/text()|preceding-sibling::text()
following-sibling::*/text()|following-sibling::text()
Sample implementation (definitely violating the DRY principle):
def _stripped_text(element, expression):
    """Yield every non-blank string selected by *expression*, stripped.

    ``element`` only needs an ``xpath(expression)`` method returning an
    iterable of strings (an lxml element in the surrounding example).
    Results that are empty after ``strip()`` are skipped.
    """
    for item in element.xpath(expression):
        item = item.strip()
        if item:
            yield item


def get_text_before(element):
    """Return an iterator over the non-blank text found before *element*.

    Selects the direct text of the preceding sibling elements together with
    the preceding sibling text nodes, in the order the XPath engine returns
    them.
    """
    return _stripped_text(
        element, "preceding-sibling::*/text()|preceding-sibling::text()"
    )


def get_text_after(element):
    """Return an iterator over the non-blank text found after *element*.

    Mirror image of ``get_text_before``: selects the direct text of the
    following sibling elements together with the following sibling text
    nodes.
    """
    return _stripped_text(
        element, "following-sibling::*/text()|following-sibling::text()"
    )
# Report the text on either side of every <dd> separator in the tree.
for el in tree.findall('.//dd'):
    before = " ".join(get_text_before(el))
    after = " ".join(get_text_after(el))
    # A single parenthesised argument prints the same on Python 2 and 3,
    # whereas the bare ``print {...}`` statement is a SyntaxError on Python 3.
    print({
        "el": el,
        "before": before,
        "after": after,
    })
Prints:
{'el': <Element dd at 0x10af81488>, 'after': 'before and after', 'before': 'find the text'}
{'el': <Element dd at 0x10af81200>, 'after': 'and after', 'before': 'find the text before'}

Related

Parse xml file to a python list

I have a xml file:
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<Document xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03">
<CstmrCdtTrfInitn>
<GrpHdr>
<MsgId>637987745078994894</MsgId>
<CreDtTm>2022-09-14T05:48:27</CreDtTm>
<NbOfTxs>205</NbOfTxs>
<CtrlSum>154761.02</CtrlSum>
<InitgPty>
<Nm> Company</Nm>
</InitgPty>
</GrpHdr>
<PmtInf>
<PmtInfId>20220914054827-154016</PmtInfId>
<PmtMtd>TRF</PmtMtd>
<BtchBookg>true</BtchBookg>
<NbOfTxs>205</NbOfTxs>
<CtrlSum>154761.02</CtrlSum>
<PmtTpInf>
<SvcLvl>
<Cd>SEPA</Cd>
</SvcLvl>
<CtgyPurp>
<Cd>SALA</Cd>
</CtgyPurp>
</PmtTpInf>
<CdtTrfTxInf> <----------------------------------
<Amt>
<InstdAmt Ccy="EUR">1536.96</InstdAmt>
</Amt>
<Cdtr>
<Nm>Achternaam, Voornaam </Nm>
</Cdtr>
<CdtrAcct>
<Id>
<IBAN>NL80RABO0134343443</IBAN>
</Id>
</CdtrAcct>
</CdtTrfTxInf> <------------------------------------
<CdtTrfTxInf> <----------------------------------
<Amt>
<InstdAmt Ccy="EUR">1676.96</InstdAmt>
</Amt>
<Cdtr>
<Nm>Achternaam, Voornaam </Nm>
</Cdtr>
<CdtrAcct>
<Id>
<IBAN>NL80RABO013433222243</IBAN>
</Id>
</CdtrAcct>
</CdtTrfTxInf> <------------------------------------
</CstmrCdtTrfInitn>
</Document>
I use ElementTree:
I want a Python list of tuples with the info within each CdtTrfTxInf tag (everything between the arrows in the example XML file). So in this example I want a list with 2 tuples.
How can I do that?
I can iterate over the tree, but that's it.
my code:
import xml.etree.ElementTree as ET
tree = ET.parse(xml_file)
root = tree.getroot()
for elem in tree.iter():
print(elem.tag, elem.text) --> i get every tag in the whole file

I rather like to use xmltodict.
First of all, your input data as given is missing a closing </PmtInf> tag towards the end, just before your closing </CstmrCdtTrfInitn> tag. After fixing that, I saved your xml data into a file and did the following:
import xmltodict
with open("input_data.xml", "r") as f:
xml_data = f.read()
xml_dict = xmltodict.parse(xml_data)
You can then access the xml data using dictionary accessors, for example:
xml_dict
>>>{'Document': {'@xmlns:xsi': 'http://www.w3.org/20...a-instance', '@xmlns': 'urn:iso:std:iso:2002...001.001.03', 'CstmrCdtTrfInitn': {...}}}
xml_dict["Document"]
>>>{'@xmlns:xsi': 'http://www.w3.org/20...a-instance', '@xmlns': 'urn:iso:std:iso:2002...001.001.03', 'CstmrCdtTrfInitn': {'GrpHdr': {...}, 'PmtInf': {...}}}
xml_dict["Document"]["CstmrCdtTrfInitn"].keys()
>>>dict_keys(['GrpHdr', 'PmtInf'])
xml_dict["Document"]["CstmrCdtTrfInitn"]["PmtInf"]
{'PmtInfId': '20220914054827-154016', 'PmtMtd': 'TRF', 'BtchBookg': 'true', 'NbOfTxs': '205', 'CtrlSum': '154761.02', 'PmtTpInf': {'SvcLvl': {...}, 'CtgyPurp': {...}}, 'CdtTrfTxInf': [{...}, {...}]}
xml_dict["Document"]["CstmrCdtTrfInitn"]["PmtInf"].keys()
dict_keys(['PmtInfId', 'PmtMtd', 'BtchBookg', 'NbOfTxs', 'CtrlSum', 'PmtTpInf', 'CdtTrfTxInf'])
Then you can loop over your CdtTrfTxInf with:
for item in xml_dict["Document"]["CstmrCdtTrfInitn"]["PmtInf"]["CdtTrfTxInf"]:
print(item)
giving the output:
{'Amt': {'InstdAmt': {'@Ccy': 'EUR', '#text': '1536.96'}}, 'Cdtr': {'Nm': 'Achternaam, Voornaam'}, 'CdtrAcct': {'Id': {'IBAN': 'NL80RABO0134343443'}}}
{'Amt': {'InstdAmt': {'@Ccy': 'EUR', '#text': '1676.96'}}, 'Cdtr': {'Nm': 'Achternaam, Voornaam'}, 'CdtrAcct': {'Id': {'IBAN': 'NL80RABO013433222243'}}}
which you can process as you want.

This is just a quickly written attempt, so give it a try:
import xml.etree.ElementTree as ET

tree = ET.parse("fr.xml")
root = tree.getroot()
# Becomes True once the first CdtTrfTxInf element has been seen; from then on
# every element with non-blank text is printed.
test = False
for elem in tree.iter():
    # ElementTree reports namespaced tags as "{namespace-uri}Name"; compare
    # only the local name so the check also matches documents that declare a
    # default namespace (as the sample pain.001 file does).
    if elem.tag.rsplit("}", 1)[-1] == "CdtTrfTxInf":
        test = True
        continue
    # elem.text is None for elements without any text, so guard before strip().
    if test and elem.text and elem.text.strip():
        print(elem.tag, elem.text)
With the result as a list of tuples:
import xml.etree.ElementTree as ET


def _local_name(tag):
    """Return *tag* without a leading ``{namespace-uri}`` prefix, if any."""
    return tag.rsplit("}", 1)[-1]


def collect_tag_text_pairs(root, start_tag="CdtTrfTxInf"):
    """Collect ``(tag, text)`` pairs for elements after the first *start_tag*.

    Walks ``root.iter()``.  Nothing is collected until an element whose local
    name equals *start_tag* is met; from then on every element whose text is
    non-blank contributes one ``(elem.tag, elem.text)`` tuple.  Elements named
    *start_tag* themselves are never included.

    Args:
        root: element to walk (anything offering ``iter()`` over elements
            with ``tag`` and ``text`` attributes).
        start_tag: local (namespace-free) name of the element that switches
            collection on.

    Returns:
        list of ``(tag, text)`` tuples; ``tag`` keeps any namespace prefix and
        ``text`` is returned unstripped, exactly as stored on the element.
    """
    pairs = []
    collecting = False
    for elem in root.iter():
        # Compare local names so namespaced documents match as well.
        if _local_name(elem.tag) == start_tag:
            collecting = True
            continue
        # elem.text is None for elements without text; treat that as blank.
        if collecting and elem.text and elem.text.strip():
            pairs.append((elem.tag, elem.text))
    return pairs


if __name__ == "__main__":
    tree = ET.parse("fr.xml")
    root = tree.getroot()
    data = collect_tag_text_pairs(root)
    print(data)

convert nested dictionary to file

I have a dictionary that I have modified by pulling from a file and modifying values, and now I want to put it back in a file in a similar format.
dictionary is similar to the following:
d={'a':
{'c':'something else',
'd':{'e':'some item'}
},
'b':
{'z':'something else',
's':{'f':'some item'}
}
}
This is a very long dictionary with nested items and I am guessing I have to use some kind of recursion
I am not sure how to go about this currently so I have no existing code to get from where I am which is a dictionary to a file.
The end result I am trying to get is the following including newlines and spacing:
<a>
c = something else
<d>
e = some item
</d>
</a>
<b>
z = something else
<s>
f = some item
</s>
</b>

d={'a':
{'c':'something else',
'd':{'e':'some item'}
},
'b':
{'z':'something else',
's':{'f':'some item'}
}
}
def printer(d, t=0):
    """Yield the lines of mapping *d* rendered as nested pseudo-XML.

    A key whose value is itself a mapping becomes an opening ``<key>`` line,
    the recursively rendered contents one level deeper, and a closing
    ``</key>`` line.  Any other value is rendered as ``key = value``.

    Args:
        d: mapping to render.
        t: current nesting depth; each level is indented with one tab.

    Yields:
        One output line at a time, without a trailing newline.
    """
    from collections.abc import Mapping

    indent = '\t' * t
    for k, v in d.items():
        if isinstance(v, Mapping):
            yield indent + '<{}>'.format(k)
            yield from printer(v, t=t + 1)
            yield indent + '</{}>'.format(k)
        else:
            # Any non-mapping leaf (str, int, None, ...) is formatted as-is
            # instead of being recursed into and failing on ``.items()``.
            yield indent + '{} = {}'.format(k, v)
s = '\n'.join(printer(d))
print(s)
prints:
<a>
c = something else
<d>
e = some item
</d>
</a>
<b>
z = something else
<s>
f = some item
</s>
</b>

Not getting XML output as expected

I have Python3 and am following this XML tutorial, https://docs.python.org/3.7/library/xml.etree.elementtree.html
I wish to output a listing of all DailyIndexRatio
DailyIndexRatio {'CUSIP': '912810FD5','IssueDate': '1998-04-15',
'Date':'2019-03-01','RefCPI':'251.23300','IndexRatio':'1.55331' }
....
Instead my code outputs
DailyIndexRatio {}
....
How to fix?
Here is the code
import xml.etree.ElementTree as ET
tree = ET.parse('CPI_20190213.xml')
root = tree.getroot()
print(root.tag)
print(root.attrib)
for child in root:
print(child.tag,child.attrib)
And I downloaded the xml file from https://treasurydirect.gov/xml/CPI_20190213.xml

import xml.etree.ElementTree as ET

tree = ET.parse('CPI_20190213.xml')  # Load the XML
root = tree.getroot()  # Get XML root element

# The values of a DailyIndexRatio live in its child elements, not in XML
# attributes, so build the dictionary from each child's tag and text.
for ratio in root.findall('.//DailyIndexRatio'):
    fields = {child.tag: child.text for child in ratio}
    # Print the DailyIndexRatio tag name followed by its field dictionary.
    print(ratio.tag, fields)
Outputs:
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-01', 'RefCPI': '251.23300', 'IndexRatio': '1.55331'}
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-02', 'RefCPI': '251.24845', 'IndexRatio': '1.55341'}
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-03', 'RefCPI': '251.26390', 'IndexRatio': '1.55351'}
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-04', 'RefCPI': '251.27935', 'IndexRatio': '1.55360'}
...

You're printing the attributes, but that element does not have any attributes.
This is an element with attributes:
<element name="Bob" age="40" sex="male" />
But the element you're trying to print doesn't have those. It has child elements:
<element>
<name>Bob</name>
<age>40</age>
<sex>male</sex>
</element>

Parse multiple similar field values from XML file with Python Regular Expression

I am trying to parse an xml file with regular expression.
Whichever script tag has "catch" alias, I need to collect "type" and "value".
<script type="abc">
<line x="word" size="1" alias="catch" value="4" desc="description"/>
</script>
<script type="xyz">
<line x="state" size="5" alias="catch" value="8" desc="description"/>
</script>
I tried this regular expression with multiline and dotall:
>>> re.findall(r'script\s+type=\"(\w+)\".*alias=\"catch\"\s+value=\"(\d+)\"', a, re.MULTILINE|re.DOTALL)
Output which I am getting is:
[('abc', '8')]
Expected output is:
[('abc', '4'), ('xyz', '8')]
Can someone help me in figuring out what I am missing here?

I recommend using BeautifulSoup. You can parse through the tags and, with a little bit of data re-structuring, easily check for the right alias values and store the related attributes of interest. Like this:
from bs4 import BeautifulSoup

soup = BeautifulSoup(data, "lxml")
to_keep = []
for script in soup.find_all("script"):
    t = script["type"]
    # The body of each <script> is read here as one text node and split on
    # whitespace into name="value" tokens.  Splitting on the first "=" only
    # keeps values that themselves contain "=" from breaking the pairing.
    attrs = dict(
        attr.split("=", 1)
        for attr in script.contents[0].split()
        if "=" in attr
    )
    # Values keep their surrounding double quotes, hence the quoted literal.
    # .get() skips <script> tags whose line has no alias attribute at all.
    if attrs.get("alias") == '"catch"':
        to_keep.append({"type": t, "value": attrs["value"]})
to_keep
# [{'type': 'abc', 'value': '"4"'}, {'type': 'xyz', 'value': '"8"'}]
Data:
data = """<script type="abc">
<line x="word" size="1" alias="catch" value="4" desc="description"/>
</script>
<script type="xyz">
<line x="state" size="5" alias="catch" value="8" desc="description"/>
</script>"""

Found the answer. Thanks downvoter. I don't think there was any need to downvote this question.
>>> re.findall(r'script\s+type=\"(\w+)\".*?alias=\"catch\"\s+value=\"(\d+)\".*?\<\/script\>', a, re.MULTILINE|re.DOTALL)
[('abc', '4'), ('xyz', '8')]

Cant pull text out of tags using BeautifulSoup

this works...
print soup.findAll('td',{ "class" : "green center" })
but I only need the text, and this won't work...
print soup.findAll('td',{ "class" : "green center" }).text
here is how im using it.
#!python27
import fileinput
import sys
import BeautifulSoup
from BeautifulSoup import BeautifulSoup
import re, urllib
filename = "url.txt"
LineNumber=0
f=open(filename)
lines=f.readlines()
f.close()
for line in lines:
filehandle = urllib.urlopen("http://kat.ph/usearch/"+lines[LineNumber]+"/")
line = filehandle.read()
soup = BeautifulSoup(line)
print soup.findAll('td',{ "class" : "green center" }).text
print soup.findAll('td',{ "class" : "red lasttd center" }).text
print LineNumber
LineNumber=LineNumber+1
filehandle.close()
Postscript: url.txt is just a list of searches to be run.

findAll returns a list of all elements that meet your criteria "class":"green center", etc. If you're interested in returning a single element of that list, you can call it by offset:
print soup.findAll('td',{'class':'green center'})[0].text
print soup.findAll('td',{'class':'green center'})[1].text
Alternatively, you could iterate over the list:
for td in soup.findAll('td',{'class':'green center'}):
print td.text
Further, if you were interested in combining all of the text within the list, you could append each element to a list and join:
td_list = []
for td in soup.findAll('td',{'class':'green center'}):
td_list.append(td.text)
print ' '.join(str(x) for x in td_list)
I hope this helped!

findAll will get a list of elements, you need to loop through the list and call .text on each element.
for e in soup.findAll('td',{ "class" : "green center" }):
print e.text

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

lxml etree get all text before element - python

Related

Parse xml file to a python list

convert nested dictionary to file

Not getting XML output as expected

Parse multiple similar field values from XML file with Python Regular Expression

Cant pull text out of tags using BeautifulSoup

Categories

Resources