lxml etree get all text before element - python

How can I get all the text before an element in an etree, separately from the text after that element?
from lxml import etree
tree = etree.fromstring('''
<a>
find
<b>
the
</b>
text
<dd></dd>
<c>
before
</c>
<dd></dd>
and after
</a>
''')
What do I want? In this example, the <dd> tags are separators and for all of them
for el in tree.findall('.//dd'):
I would like to have all text before and after them:
[
{
el : <Element dd at 0xsomedistinctadress>,
before : 'find the text',
after : 'before and after'
},
{
el : <Element dd at 0xsomeotherdistinctadress>,
before : 'find the text before',
after : 'and after'
}
]
My idea was to use some kind of placeholders in the tree with which I replace the <dd> tags and then cut the string at that placeholder, but I need the correspondence with the actual element.

There might be a simpler way, but I would use the following XPath expressions:
preceding-sibling::*/text()|preceding::text()
following-sibling::*/text()|following::text()
Sample implementation (definitely violating the DRY principle):
def _iter_stripped_text(element, query):
    """Yield every non-blank string matched by *query*, stripped of whitespace.

    :param element: object exposing ``xpath(query)`` that returns an iterable
        of strings (lxml elements do this for ``text()`` queries).
    :param query: XPath expression selecting text nodes relative to *element*.
    """
    for item in element.xpath(query):
        item = item.strip()
        # Whitespace-only text nodes (e.g. line breaks between tags) are dropped.
        if item:
            yield item


def get_text_before(element):
    """Return an iterator over the stripped, non-empty text before *element*.

    Selects the direct text of preceding sibling elements together with the
    text nodes that are themselves preceding siblings of *element*.
    """
    return _iter_stripped_text(
        element, "preceding-sibling::*/text()|preceding-sibling::text()"
    )


def get_text_after(element):
    """Return an iterator over the stripped, non-empty text after *element*.

    Selects the direct text of following sibling elements together with the
    text nodes that are themselves following siblings of *element*.
    """
    return _iter_stripped_text(
        element, "following-sibling::*/text()|following-sibling::text()"
    )
# For every <dd> separator, report the text that surrounds it.
for el in tree.findall('.//dd'):
    before = " ".join(get_text_before(el))
    after = " ".join(get_text_after(el))
    # Calling print() with one parenthesised argument behaves the same under
    # Python 2 and Python 3, unlike the Python 2-only ``print {...}`` statement.
    print({
        "el": el,
        "before": before,
        "after": after,
    })
Prints:
{'el': <Element dd at 0x10af81488>, 'after': 'before and after', 'before': 'find the text'}
{'el': <Element dd at 0x10af81200>, 'after': 'and after', 'before': 'find the text before'}

Related

Parse xml file to a python list

I have a xml file:
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<Document xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03">
<CstmrCdtTrfInitn>
<GrpHdr>
<MsgId>637987745078994894</MsgId>
<CreDtTm>2022-09-14T05:48:27</CreDtTm>
<NbOfTxs>205</NbOfTxs>
<CtrlSum>154761.02</CtrlSum>
<InitgPty>
<Nm> Company</Nm>
</InitgPty>
</GrpHdr>
<PmtInf>
<PmtInfId>20220914054827-154016</PmtInfId>
<PmtMtd>TRF</PmtMtd>
<BtchBookg>true</BtchBookg>
<NbOfTxs>205</NbOfTxs>
<CtrlSum>154761.02</CtrlSum>
<PmtTpInf>
<SvcLvl>
<Cd>SEPA</Cd>
</SvcLvl>
<CtgyPurp>
<Cd>SALA</Cd>
</CtgyPurp>
</PmtTpInf>
<CdtTrfTxInf> <----------------------------------
<Amt>
<InstdAmt Ccy="EUR">1536.96</InstdAmt>
</Amt>
<Cdtr>
<Nm>Achternaam, Voornaam </Nm>
</Cdtr>
<CdtrAcct>
<Id>
<IBAN>NL80RABO0134343443</IBAN>
</Id>
</CdtrAcct>
</CdtTrfTxInf> <------------------------------------
<CdtTrfTxInf> <----------------------------------
<Amt>
<InstdAmt Ccy="EUR">1676.96</InstdAmt>
</Amt>
<Cdtr>
<Nm>Achternaam, Voornaam </Nm>
</Cdtr>
<CdtrAcct>
<Id>
<IBAN>NL80RABO013433222243</IBAN>
</Id>
</CdtrAcct>
</CdtTrfTxInf> <------------------------------------
</CstmrCdtTrfInitn>
</Document>
I use ElementTree:
I want a Python list of tuples with the info within the tag (everything between the arrows in the example XML file). So in this example I want a list with 2 tuples.
How can I do that?
I can iterate over the tree, but that's it.
my code:
import xml.etree.ElementTree as ET
tree = ET.parse(xml_file)
root = tree.getroot()
for elem in tree.iter():
print(elem.tag, elem.text) --> i get every tag in the whole file
I would rather use xmltodict for this.
First of all, your input data as given is missing a closing </PmtInf> tag towards the end, just before your closing </CstmrCdtTrfInitn> tag. After fixing that, I saved your xml data into a file and did the following:
import xmltodict
with open("input_data.xml", "r") as f:
xml_data = f.read()
xml_dict = xmltodict.parse(xml_data)
You can then access the xml data using dictionary accessors, for example:
xml_dict
>>>{'Document': {'@xmlns:xsi': 'http://www.w3.org/20...a-instance', '@xmlns': 'urn:iso:std:iso:2002...001.001.03', 'CstmrCdtTrfInitn': {...}}}
xml_dict["Document"]
>>>{'@xmlns:xsi': 'http://www.w3.org/20...a-instance', '@xmlns': 'urn:iso:std:iso:2002...001.001.03', 'CstmrCdtTrfInitn': {'GrpHdr': {...}, 'PmtInf': {...}}}
xml_dict["Document"]["CstmrCdtTrfInitn"].keys()
>>>dict_keys(['GrpHdr', 'PmtInf'])
xml_dict["Document"]["CstmrCdtTrfInitn"]["PmtInf"]
{'PmtInfId': '20220914054827-154016', 'PmtMtd': 'TRF', 'BtchBookg': 'true', 'NbOfTxs': '205', 'CtrlSum': '154761.02', 'PmtTpInf': {'SvcLvl': {...}, 'CtgyPurp': {...}}, 'CdtTrfTxInf': [{...}, {...}]}
xml_dict["Document"]["CstmrCdtTrfInitn"]["PmtInf"].keys()
dict_keys(['PmtInfId', 'PmtMtd', 'BtchBookg', 'NbOfTxs', 'CtrlSum', 'PmtTpInf', 'CdtTrfTxInf'])
Then you can loop over your CdtTrfTxInf with:
for item in xml_dict["Document"]["CstmrCdtTrfInitn"]["PmtInf"]["CdtTrfTxInf"]:
print(item)
giving the output:
{'Amt': {'InstdAmt': {'@Ccy': 'EUR', '#text': '1536.96'}}, 'Cdtr': {'Nm': 'Achternaam, Voornaam'}, 'CdtrAcct': {'Id': {'IBAN': 'NL80RABO0134343443'}}}
{'Amt': {'InstdAmt': {'@Ccy': 'EUR', '#text': '1676.96'}}, 'Cdtr': {'Nm': 'Achternaam, Voornaam'}, 'CdtrAcct': {'Id': {'IBAN': 'NL80RABO013433222243'}}}
which you can process as you want.
This is just a quick attempt; give it a try:
import xml.etree.ElementTree as ET


def _local_name(tag):
    """Return *tag* without a leading ``{namespace}`` prefix, if it has one."""
    return tag.rsplit("}", 1)[-1]


def iter_transaction_fields(root, wanted="CdtTrfTxInf"):
    """Yield ``(tag, text)`` for each non-blank descendant of every *wanted* element.

    :param root: ElementTree element to search (the element itself included).
    :param wanted: local (namespace-free) tag name of the container elements.

    Tags are compared and reported by local name, so documents that declare a
    default namespace are matched as well. Elements whose ``text`` is ``None``
    or whitespace-only are skipped. Only descendants of a *wanted* element are
    reported; elements that merely follow one in the document are not.
    """
    for container in root.iter():
        if _local_name(container.tag) != wanted:
            continue
        for elem in container.iter():
            # Element.iter() starts with the container itself; skip it.
            if elem is container:
                continue
            if elem.text and elem.text.strip():
                yield _local_name(elem.tag), elem.text


if __name__ == "__main__":
    tree = ET.parse("fr.xml")
    for tag, text in iter_transaction_fields(tree.getroot()):
        print(tag, text)
With the result as a list of tuples:
import xml.etree.ElementTree as ET


def _local_name(tag):
    """Return *tag* without a leading ``{namespace}`` prefix, if it has one."""
    return tag.rsplit("}", 1)[-1]


def collect_transaction_fields(root, wanted="CdtTrfTxInf"):
    """Return a list of ``(tag, text)`` tuples found inside every *wanted* element.

    :param root: ElementTree element to search (the element itself included).
    :param wanted: local (namespace-free) tag name of the container elements.
    :returns: one tuple per descendant whose text is not blank; the text is
        kept exactly as parsed (not stripped).

    Tags are compared and reported by local name, so documents that declare a
    default namespace are matched as well. Elements whose ``text`` is ``None``
    or whitespace-only are skipped. Only descendants of a *wanted* element are
    collected; elements that merely follow one in the document are not.
    """
    data = []
    for container in root.iter():
        if _local_name(container.tag) != wanted:
            continue
        for elem in container.iter():
            # Element.iter() starts with the container itself; skip it, and
            # skip anything without meaningful text.
            if elem is container or not (elem.text and elem.text.strip()):
                continue
            data.append((_local_name(elem.tag), elem.text))
    return data


if __name__ == "__main__":
    tree = ET.parse("fr.xml")
    data = collect_transaction_fields(tree.getroot())
    print(data)

convert nested dictionary to file

I have a dictionary that I have modified by pulling from a file and modifying values, and now I want to put it back in a file in a similar format.
dictionary is similar to the following:
d={'a':
{'c':'something else',
'd':{'e':'some item'}
},
'b':
{'z':'something else',
's':{'f':'some item'}
}
}
This is a very long dictionary with nested items and I am guessing I have to use some kind of recursion
I am not sure how to go about this currently so I have no existing code to get from where I am which is a dictionary to a file.
The end result I am trying to get is the following including newlines and spacing:
<a>
c = something else
<d>
e = some item
</d>
</a>
<b>
z = something else
<s>
f = some item
</s>
</b>
d={'a':
{'c':'something else',
'd':{'e':'some item'}
},
'b':
{'z':'something else',
's':{'f':'some item'}
}
}
def printer(d, t=0):
    """Yield the lines of mapping *d* rendered as a tag-style text tree.

    :param d: mapping whose values are either nested mappings or leaf values.
    :param t: current nesting depth; each level is indented by one tab.

    A value with an ``items`` method (other than a ``str``) is treated as a
    nested section and wrapped in ``<key>`` / ``</key>`` lines. Every other
    value is a leaf and rendered as ``key = value``, so non-string leaves
    such as numbers or ``None`` are formatted rather than recursed into.
    """
    indent = '\t' * t
    for k, v in d.items():
        if isinstance(v, str) or not hasattr(v, 'items'):
            yield indent + '{} = {}'.format(k, v)
        else:
            yield indent + '<{}>'.format(k)
            yield from printer(v, t=t + 1)
            yield indent + '</{}>'.format(k)
s = '\n'.join(printer(d))
print(s)
prints:
<a>
c = something else
<d>
e = some item
</d>
</a>
<b>
z = something else
<s>
f = some item
</s>
</b>

Not getting XML output as expected

I have Python3 and am following this XML tutorial, https://docs.python.org/3.7/library/xml.etree.elementtree.html
I wish to output a listing of all DailyIndexRatio
DailyIndexRatio {'CUSIP': '912810FD5','IssueDate': '1998-04-15',
'Date':'2019-03-01','RefCPI':'251.23300','IndexRatio':'1.55331' }
....
Instead my code outputs
DailyIndexRatio {}
....
How to fix?
Here is the code
import xml.etree.ElementTree as ET
tree = ET.parse('CPI_20190213.xml')
root = tree.getroot()
print(root.tag)
print(root.attrib)
for child in root:
print(child.tag,child.attrib)
And I downloaded the xml file from https://treasurydirect.gov/xml/CPI_20190213.xml
import xml.etree.ElementTree as ET

# Parse the XML file and take its root element.
tree = ET.parse('CPI_20190213.xml')
root = tree.getroot()

# Visit every DailyIndexRatio element found anywhere below the root.
for ratio in root.findall('.//DailyIndexRatio'):
    # Map each child's tag name to that child's text value.
    fields = {child.tag: child.text for child in ratio}
    # Show the element's tag name followed by the collected fields.
    print(ratio.tag, fields)
Outputs:
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-01', 'RefCPI': '251.23300', 'IndexRatio': '1.55331'}
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-02', 'RefCPI': '251.24845', 'IndexRatio': '1.55341'}
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-03', 'RefCPI': '251.26390', 'IndexRatio': '1.55351'}
DailyIndexRatio {'CUSIP': '912810FD5', 'IssueDate': '1998-04-15', 'Date': '2019-03-04', 'RefCPI': '251.27935', 'IndexRatio': '1.55360'}
...
You're printing the attributes, but that element does not have any attributes.
This is an element with attributes:
<element name="Bob" age="40" sex="male" />
But the element you're trying to print doesn't have those. It has child elements:
<element>
<name>Bob</name>
<age>40</age>
<sex>male</sex>
</element>

Parse multiple similar field values from XML file with Python Regular Expression

I am trying to parse an xml file with regular expression.
Whichever script tag has "catch" alias, I need to collect "type" and "value".
<script type="abc">
<line x="word" size="1" alias="catch" value="4" desc="description"/>
</script>
<script type="xyz">
<line x="state" size="5" alias="catch" value="8" desc="description"/>
</script>
I tried this regular expression with multiline and dotall:
>>> re.findall(r'script\s+type=\"(\w+)\".*alias=\"catch\"\s+value=\"(\d+)\"', a, re.MULTILINE|re.DOTALL)
Output which I am getting is:
[('abc', '8')]
Expected output is:
[('abc', '4'), ('xyz', '8')]
Can someone help me in figuring out what I am missing here?
I recommend using BeautifulSoup. You can parse through the tags and, with a little bit of data re-structuring, easily check for the right alias values and store the related attributes of interest. Like this:
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, "lxml")
to_keep = []
for script in soup.find_all("script"):
t = script["type"]
attrs = {
k:v for k, v in [attr.split("=")
for attr in script.contents[0].split()
if "=" in attr]
}
if attrs["alias"] == '"catch"':
to_keep.append({"type": t, "value": attrs["value"]})
to_keep
# [{'type': 'abc', 'value': '"4"'}, {'type': 'xyz', 'value': '"8"'}]
Data:
data = """<script type="abc">
<line x="word" size="1" alias="catch" value="4" desc="description"/>
</script>
<script type="xyz">
<line x="state" size="5" alias="catch" value="8" desc="description"/>
</script>"""
Found the answer. Thanks downvoter. I don't think there was any need to downvote this question.
>>> re.findall(r'script\s+type=\"(\w+)\".*?alias=\"catch\"\s+value=\"(\d+)\".*?\<\/script\>', a, re.MULTILINE|re.DOTALL)
[('abc', '4'), ('xyz', '8')]

Cant pull text out of tags using BeautifulSoup

This works...
print soup.findAll('td',{ "class" : "green center" })
but I only need the text, and this won't work...
print soup.findAll('td',{ "class" : "green center" }).text
Here is how I'm using it.
#!python27
import fileinput
import sys
import BeautifulSoup
from BeautifulSoup import BeautifulSoup
import re, urllib
filename = "url.txt"
LineNumber=0
f=open(filename)
lines=f.readlines()
f.close()
for line in lines:
filehandle = urllib.urlopen("http://kat.ph/usearch/"+lines[LineNumber]+"/")
line = filehandle.read()
soup = BeautifulSoup(line)
print soup.findAll('td',{ "class" : "green center" }).text
print soup.findAll('td',{ "class" : "red lasttd center" }).text
print LineNumber
LineNumber=LineNumber+1
filehandle.close()
P.S. url.txt is just a list of searches to be run.
findAll returns a list of all elements that meet your criteria "class":"green center", etc. If you're interested in returning a single element of that list, you can call it by offset:
print soup.findAll('td',{'class':'green center'})[0].text
print soup.findAll('td',{'class':'green center'})[1].text
Alternatively, you could iterate over the list:
for td in soup.findAll('td',{'class':'green center'}):
print td.text
Further, if you were interested in combining all of the text within the list, you could append each element to a list and join:
td_list = []
for td in soup.findAll('td',{'class':'green center'}):
td_list.append(td.text)
print ' '.join(str(x) for x in td_list)
I hope this helped!
findAll will get a list of elements, you need to loop through the list and call .text on each element.
for e in soup.findAll('td',{ "class" : "green center" }):
print e.text

Categories

Resources