Parse xml file with python - python

I have this simple xml file:
<BSB>
<APPLSUMMARY>
<MAIN W="S1" X="{ND}"/>
<COUNTS Z="0" AB="0" BB="0" CB="0" DB="0" EB="0" FB="0" GB="{ND}"/>
<SCOTDEBT OQB="{ND}"/>
<NOTICES HB="0" IB="3"/>
<SUB_BLOCKS C="3" D="3" E="1" F="0"/>
<ALIAS_NO UPB="0" VPB="{ND}" WPB="0"/>
<ASSOC_NO DD="0" ED="0" AC="0"/>
<ALERTSUMM PB="0" QB="0" RB="{ND}" SB="{ND}" TB="{ND}" UB="{ND}"/>
<HHOSUMM BC="{ND}" RGB="{ND}"/>
<TPD INB="{ND}" JNB="{ND}" KNB="{ND}" LNB="{ND}"/>
<OCCUPANCY AD="1"/>
<DECEASED LQB="1" FCC="{ND}" GCC="{ND}" HCC="{ND}" ICC="{ND}"/>
<IMPAIRED MQB="0"/>
<ACTIVITY JCC="{ND}" KCC="{ND}" LCC="{ND}"/>
<ADVERSE MCC="{ND}" HHC="{ND}"/>
</APPLSUMMARY>
</BSB>
I want to create in python a csv file that contains only the DECEASED contents in columns like this:
So, I am trying to get the values of the DECEASED bit and align them in columns.
I have tried this:
import xml.etree.ElementTree as ET
import io
parsed = objectify.parse(open(path)) // path is where the xml file is saved
root = parsed.getroot()
data = []
for elt in root.BSB.DECEASED:
el_data = {}
for child in elt.getchildren():
el_data[child.tag] = child.text
data.append(el_data)
perf =pd.DataFrame(data).drop_duplicates(subset=None, keep='first', inplace=False)
print(perf)
perf.to_csv('DECESEAD.csv')
I get an empty dataset:
Empty DataFrame
Columns: []
Index: []
Can anyone help me get the values inside the DECEASED tag, please?

The code below collects the data you are looking for
import xml.etree.ElementTree as ET
from typing import Dict
xml = '''<BSB>
<APPLSUMMARY>
<MAIN W="S1" X="{ND}"/>
<COUNTS Z="0" AB="0" BB="0" CB="0" DB="0" EB="0" FB="0" GB="{ND}"/>
<SCOTDEBT OQB="{ND}"/>
<NOTICES HB="0" IB="3"/>
<SUB_BLOCKS C="3" D="3" E="1" F="0"/>
<ALIAS_NO UPB="0" VPB="{ND}" WPB="0"/>
<ASSOC_NO DD="0" ED="0" AC="0"/>
<ALERTSUMM PB="0" QB="0" RB="{ND}" SB="{ND}" TB="{ND}" UB="{ND}"/>
<HHOSUMM BC="{ND}" RGB="{ND}"/>
<TPD INB="{ND}" JNB="{ND}" KNB="{ND}" LNB="{ND}"/>
<OCCUPANCY AD="1"/>
<DECEASED LQB="1" FCC="{ND}" GCC="{ND}" HCC="{ND}" ICC="{ND}"/>
<IMPAIRED MQB="0"/>
<ACTIVITY JCC="{ND}" KCC="{ND}" LCC="{ND}"/>
<ADVERSE MCC="{ND}" HHC="{ND}"/>
</APPLSUMMARY>
</BSB>'''
def _clean_dict(attributes: Dict) -> Dict:
result = {}
for k, v in attributes.items():
if v[0] == '{':
val = v[1:-1]
else:
val = v
result[k] = val
return result
# Parse the document once, then gather the cleaned attributes of every
# <DECEASED> element found anywhere below the root.
root = ET.fromstring(xml)
data = [_clean_dict(node.attrib) for node in root.findall('.//DECEASED')]
print(data)
output (list of dicts)
[{'LQB': '1', 'FCC': 'ND', 'GCC': 'ND', 'HCC': 'ND', 'ICC': 'ND'}]

Related

Excluding CSV Columns from Data Dictionary

I am attempting to create a data dictionary that does not include all of the columns in the source csv file. I have managed to create one that does include all the columns, but want to exclude some of them.
The code I am using is this:
# Read the CSV into a dict of columns: {column name: [values...]}.
# The with-block guarantees the file is closed once every row has been read;
# newline='' is the mode the csv module asks for.
with open(DATA_FILE, newline='') as csv_file:
    input_file = csv.DictReader(csv_file)
    fieldnames = input_file.fieldnames
    data_large_countries = {fn: [] for fn in fieldnames}
    for line in input_file:
        for k, v in line.items():
            # Empty cells are stored as 0.
            if v == '':
                v = 0
            # Store each cell as int if possible, else float, else the raw text.
            try:
                data_large_countries[k].append(int(v))
            except ValueError:
                try:
                    data_large_countries[k].append(float(v))
                except ValueError:
                    data_large_countries[k].append(v)
# Turn every column list into a numpy array.
for k, v in data_large_countries.items():
    data_large_countries[k] = np.array(v)
print(data_large_countries.keys())
with the output:
dict_keys(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_vaccinations', 'new_vaccinations_smoothed', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred', 'new_vaccinations_smoothed_per_million', 'new_people_vaccinated_smoothed', 'new_people_vaccinated_smoothed_per_hundred', 'stringency_index', 'population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand', 'life_expectancy', 'human_development_index', 'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative', 'excess_mortality', 'excess_mortality_cumulative_per_million'])
I only need 6 of these keys in my data dictionary. How do I amend my code to get only the keys I want?

different return types for getpath() in lxml

I have folders full of XML files which I want to parse to a dataframe. The following functions iterate through an XML tree recursively and return a dataframe with three columns: path, attributes and text.
def XML2DF(filename, df1, MAX_DEPTH=20):
    """Parse the XML file *filename* and append one row per element to *df1*.

    Args:
        filename: path of the XML file to read.
        df1: DataFrame the new rows are concatenated onto.
        MAX_DEPTH: deepest nesting level that is still recorded.

    Returns:
        The DataFrame produced by recursive_parseXML2DF (columns "path",
        "attrib" and "text" for the rows added here).
    """
    # Read bytes, not text: lxml refuses str input that carries an XML
    # encoding declaration (assumes `etree` is lxml.etree, as getpath() implies).
    with open(filename, "rb") as f:
        xml_bytes = f.read()
    tree = etree.fromstring(xml_bytes)
    # Return the result; a bare `return` would throw the built frame away.
    return recursive_parseXML2DF(tree, df1, MAX_DEPTH=MAX_DEPTH)
def recursive_parseXML2DF(element, df1, depth=0, MAX_DEPTH=20):
    """Append *element* and all of its descendants to *df1*, one row each.

    Args:
        element: the node to record; it must provide getroottree(), attrib,
            text and iteration over its children (as lxml elements do).
        df1: DataFrame the rows are concatenated onto.
        depth: nesting level of *element* (0 for the starting node).
        MAX_DEPTH: nodes nested deeper than this are not recorded.

    Returns:
        A DataFrame holding the rows of *df1* followed by one
        ("path", "attrib", "text") row per visited node, in document order.
    """
    if depth > MAX_DEPTH:
        return df1
    row = pd.DataFrame(
        [[element.getroottree().getpath(element), element.attrib, element.text]],
        columns=["path", "attrib", "text"],
    )
    df1 = pd.concat([df1, row])
    # Iterating the element replaces the deprecated getchildren().
    for child in element:
        # Forward MAX_DEPTH so a caller-supplied limit also applies below the root.
        df1 = recursive_parseXML2DF(child, df1, depth=depth + 1, MAX_DEPTH=MAX_DEPTH)
    return df1
The code for the function was adapted from this post.
Most of the time the function works fine and returns the entire path, but for some documents the returned path looks like this:
/*/*[1]/*[3]
/*/*[1]/*[3]/*[1]
The text tag entry remains valid and correct.
The only difference in the XML between the working-path and the wildcard-path documents that I can make out is that the XML tags are written in all caps.
Working example:
<?xml version="1.0" encoding="utf-8"?>
<root>
<Header>
<ReceivingApplication>ReceivingApplication</ReceivingApplication>
<SendingApplication>SendingApplication</SendingApplication>
<MessageControlID>12345</MessageControlID>
<ReceivingApplication>ReceivingApplication</ReceivingApplication>
<FileCreationDate>2000-01-01T00:00:00</FileCreationDate>
</Header>
<Einsendung>
<Patient>
<PatientName>Name</PatientName>
<PatientVorname>FirstName</PatientVorname>
<PatientGebDat>2000-01-01T00:00:00</PatientGebDat>
<PatientSex>4</PatientSex>
<PatientPWID>123456</PatientPWID>
</Patient>
<Visit>
<VisitNumber>A2000.0001</VisitNumber>
<PatientPLZ>1234</PatientPLZ>
<PatientOrt>PatientOrt</PatientOrt>
<PatientAdr2>
</PatientAdr2>
<PatientStrasse>PatientStrasse 01</PatientStrasse>
<VisitEinsID>1234</VisitEinsID>
<VisitBefund>VisitBefund</VisitBefund>
<Befunddatum>2000-01-01T00:00:00</Befunddatum>
</Visit>
</Einsendung>
</root>
nonsensical Example:
<?xml version="1.0"?>
<KRSCHWEIZ xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="krSCHWEIZ">
<KEY_VS>abcdefg</KEY_VS>
<KEY_KLR>abcdefg</KEY_KLR>
<ABSENDER>
<ABSENDER_MELDER_ID>123456</ABSENDER_MELDER_ID>
<MELDER>
<MELDER_ID>123456</MELDER_ID>
<QUELLSYSTEM>ABCDEF</QUELLSYSTEM>
<PATIENT>
<REFERENZNR>987654</REFERENZNR>
<NACHNAME>my name</NACHNAME>
<VORNAMEN>my first name</VORNAMEN>
<GEBURTSNAME />
<GEBURTSDATUM>my dob</GEBURTSDATUM>
<GESCHLECHT>XX</GESCHLECHT>
<PLZ>9999</PLZ>
<WOHNORT>Mycity</WOHNORT>
<STRASSE>mystreet</STRASSE>
<HAUSNR>99</HAUSNR>
<VERSICHERTENNR>999999999</VERSICHERTENNR>
<DATEIEN>
<DATEI>
<DATEINAME>my_attached_document.html</DATEINAME>
<DATEIBASE64>mybase_64_encoded_document</DATEIBASE64>
</DATEI>
</DATEIEN>
</PATIENT>
</MELDER>
</ABSENDER>
</KRSCHWEIZ>
How do I get correct explicit path information also for this case?
The presence of namespaces changes the output of .getpath() - you can use .getelementpath() instead, which will include the namespace prefix instead of using wildcards.
If the prefix should be discarded completely - you can strip them out before using .getpath()
import lxml.etree
import pandas as pd
# Collect one [path, attrib, text] row per node of broken.xml.
rows = []
tree = lxml.etree.parse("broken.xml")
for node in tree.iter():
    try:
        # Replace the tag by its local name (namespace removed). This edits
        # the parsed tree in place and has to happen before getpath() below,
        # so that the path is built from the plain tag names.
        node.tag = lxml.etree.QName(node).localname
    except ValueError:
        # skip tags with no name
        continue
    rows.append([tree.getpath(node), node.attrib, node.text])
df = pd.DataFrame(rows, columns=["path", "attrib", "text"])
Resulting dataframe:
>>> df
path attrib text
0 /KRSCHWEIZ [] \n
1 /KRSCHWEIZ/KEY_VS [] abcdefg
2 /KRSCHWEIZ/KEY_KLR [] abcdefg
3 /KRSCHWEIZ/ABSENDER [] \n
4 /KRSCHWEIZ/ABSENDER/ABSENDER_MELDER_ID [] 123456
5 /KRSCHWEIZ/ABSENDER/MELDER [] \n
6 /KRSCHWEIZ/ABSENDER/MELDER/MELDER_ID [] 123456
7 /KRSCHWEIZ/ABSENDER/MELDER/QUELLSYSTEM [] ABCDEF
8 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT [] \n
9 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/REFERENZNR [] 987654
10 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/NACHNAME [] my name
11 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/VORNAMEN [] my first name
12 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/GEBURTSNAME [] None
13 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/GEBURTSDATUM [] my dob
14 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/GESCHLECHT [] XX
15 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/PLZ [] 9999
16 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/WOHNORT [] Mycity
17 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/STRASSE [] mystreet
18 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/HAUSNR [] 99
19 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/VERSICHERTENNR [] 999999999
20 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN [] \n
21 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN/DATEI [] \n
22 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN/DAT... [] my_attached_document.html
23 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN/DAT... [] mybase_64_encoded_document

Python XML comparison is failing due to extra element tag in one of the XMLs

I have a script which is comparing two XMLs. Comparison is working fine if all the element tags are the same under <account> tag but after adding an extra tag <branchID5> in b.xml for account# 600789488 then it is not printing the differences.
a.xml
<svc>
<accounts>
<account>
<acctBasicInfo>
<acctName>600789488</acctName>
<branchID2>56</branchID2>
<realparties>
<realparty>
<realname>lui</realname>
</realparty>
</realparties>
</acctBasicInfo>
</account>
<account>
<acctBasicInfo>
<acctName>44646</acctName>
<branchID2>86</branchID2>
<realparties>
<realparty>
<realname>lui</realname>
</realparty>
</realparties>
</acctBasicInfo>
</account>
</accounts>
</svc>
b.xml
<svc>
<accounts>
<account>
<acctBasicInfo>
<acctName>44646</acctName>
<branchID2>86</branchID2>
<realparties>
<realparty>
<realname>lui</realname>
</realparty>
</realparties>
</acctBasicInfo>
</account>
<account>
<acctBasicInfo>
<acctName>600789488</acctName>
<branchID2>56</branchID2>
<branchID5>66</branchID5>
<realparties>
<realparty>
<realname>lu</realname>
</realparty>
</realparties>
</acctBasicInfo>
</account>
</accounts>
</svc>
code:
from lxml import etree
from collections import defaultdict
from pprintpp import pprint as pp
root_1 = etree.parse('a.xml').getroot()
root_2 = etree.parse('b.xml').getroot()
d1, d2 = [], []
for node in root_1.findall('.//account'):
item = defaultdict(list)
for x in node.iter():
for k, v in x.attrib.items():
item[k].append(v)
if x.text is None:
item[x.tag].append('None')
elif x.text.strip():
item[x.tag].append(x.text.strip())
d1.append(dict(item))
for node in root_2.findall('.//account'):
item = defaultdict(list)
for x in node.iter():
for k, v in x.attrib.items():
item[k].append(v)
if x.text is None:
item[x.tag].append('None')
elif x.text.strip():
item[x.tag].append(x.text.strip())
d2.append(dict(item))
d1 = sorted(d1, key = lambda x: x['acctName'])
d2 = sorted(d2, key = lambda x: x['acctName'])
print(d1)
print(d2)
res_dict = defaultdict(list)
for x, y in zip(d1, d2):
for key1, key2 in zip(x.keys(), y.keys()):
if (key1 == key2) and sorted(x[key1]) != sorted(y[key2]):
a =set(x[key1])
b = set(y[key2])
diff = ([(i+'--'+'test1.xml') if i in a else (i+'--'+'test2.xml') if i in b else '' for i in list(a^b)])
res_dict[x['acctName'][0]].append({key1: diff})
if res_dict == {}:
print('Data is same in both XML files')
else:
pp(dict(res_dict))
Current output: the differences are not found, because 'branchID5': ['66'] comes before the differing 'realname': ['lu'] in d2, so the keys of d1 and d2 no longer line up when they are zipped together.
d1:
[{'acctName': ['44646'], 'branchID2': ['86'], 'realname': ['lui']}, {'acctName': ['600789488'], 'branchID2': ['56'], 'realname': ['lui']}]
d2:
[{'acctName': ['44646'], 'branchID2': ['86'], 'realname': ['lui']}, {'acctName': ['600789488'], 'branchID2': ['56'], 'branchID5': ['66'], 'realname': ['lu']}]
Data is same in both XML files
Expected output: It should print the differences. It should ignore the uncommon element tags from both the xmls
{'600789488': [{'realname': ['lui--test1.xml', 'lu--test2.xml']}]}
I believe you made it a little more complicated than absolutely necessary. Since you are using etree, you might as well use xpath to get there.
# For every account in a.xml, look up the account with the same acctName in
# b.xml and report the pair when their real names differ.
# root_1 / root_2 are the parsed roots of a.xml / b.xml.
names1 = root_1.xpath('.//account/acctBasicInfo')
for name in names1:
    real_names = name.xpath('.//realname/text()')  # real name(s) in root_1
    acct_names = name.xpath('./acctName/text()')   # acctName in root_1
    if not real_names or not acct_names:
        # incomplete account entry: nothing to compare
        continue
    rn, actNm = real_names[0], acct_names[0]
    # next line is the key: search root_2 for an account with the same acctName
    # as the current node of root_1. The value is passed as an XPath variable
    # ($acct) rather than pasted into the expression, so names that are not
    # plain numbers (or contain quotes) are matched correctly as strings.
    twins = root_2.xpath(
        './/account/acctBasicInfo[acctName=$acct]//realname/text()',
        acct=actNm,
    )
    if not twins:
        # the account exists only in root_1; there is no twin to compare with
        continue
    twin = twins[0]
    # now compare the real names in both accounts in the two roots, and if not the same, create alert
    if rn != twin:
        print({f'{actNm}': [{'realname': [f'{rn}--test1.xml', f'{twin}--test2.xml']}]})
Output:
{'600789488': [{'realname': ['lui--test1.xml', 'lu--test2.xml']}]}

parse xml to pandas data frame in python

I am trying to read the XML file and convert it to pandas. However it returns empty data
This is the sample of xml structure:
<Instance ID="1">
<MetaInfo StudentID ="DTSU040" TaskID="LP03_PR09.bLK.sh" DataSource="DeepTutorSummer2014"/>
<ProblemDescription>A car windshield collides with a mosquito, squashing it.</ProblemDescription>
<Question>How does this work tion?</Question>
<Answer>tthis is my best </Answer>
<Annotation Label="correct(0)|correct_but_incomplete(1)|contradictory(0)|incorrect(0)">
<AdditionalAnnotation ContextRequired="0" ExtraInfoInAnswer="0"/>
<Comments Watch="1"> The student forgot to tell the opposite force. Opposite means opposite direction, which is important here. However, one can argue that the opposite is implied. See the reference answers.</Comments>
</Annotation>
<ReferenceAnswers>
1: Since the windshield exerts a force on the mosquito, which we can call action, the mosquito exerts an equal and opposite force on the windshield, called the reaction.
</ReferenceAnswers>
</Instance>
I have tried this code, however it's not working on my side. It returns empty dataframe.
import pandas as pd
import xml.etree.ElementTree as et
xtree = et.parse("grade_data.xml")
xroot = xtree.getroot()
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", 'Question', 'Answer',
'ContextRequired', 'ExtraInfoInAnswer', 'Comments', 'Watch', 'ReferenceAnswers']
rows = []
for node in xroot:
s_name = node.attrib.get("ID")
s_student = node.find("StudentID")
s_task = node.find("TaskID")
s_source = node.find("DataSource")
s_desc = node.find("ProblemDescription")
s_question = node.find("Question")
s_ans = node.find("Answer")
s_label = node.find("Label")
s_contextrequired = node.find("ContextRequired")
s_extraInfoinAnswer = node.find("ExtraInfoInAnswer")
s_comments = node.find("Comments")
s_watch = node.find("Watch")
s_referenceAnswers = node.find("ReferenceAnswers")
rows.append({"ID": s_name,"StudentID":s_student, "TaskID": s_task,
"DataSource": s_source, "ProblemDescription": s_desc ,
"Question": s_question , "Answer": s_ans ,"Label": s_label,
"s_contextrequired": s_contextrequired , "ExtraInfoInAnswer": s_extraInfoinAnswer ,
"Comments": s_comments , "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers,
})
out_df = pd.DataFrame(rows, columns = df_cols)
The problem in your solution was that the "element data extraction" was not done properly. The xml you mentioned in the question is nested in several layers. And that is why we need to recursively read and extract the data. The following solution should give you what you need in this case. Although I would encourage you to look at this article and the python documentation for more clarity.
Method: 1
import numpy as np
import pandas as pd
#import os
import xml.etree.ElementTree as ET
def xml2df(xml_source, df_cols, source_is_file = False, show_progress=True):
    """Parse the input XML source and store the result in a pandas
    DataFrame with the given columns.

    Every direct child of the XML root is one record (one row). The record's
    first attribute, together with the attributes and the text of all of its
    descendants, is flattened into a single dict keyed by attribute name or
    tag name; when the same key occurs more than once the last value wins.

    For xml_source = xml_file, Set: source_is_file = True
    For xml_source = xml_string, Set: source_is_file = False

        <element attribute_key1="attribute_value1" attribute_key2="attribute_value2">
            <child1>Child 1 Text</child1>
            <child2>Child 2 Text</child2>
            <child3>Child 3 Text</child3>
        </element>

    Note that for an xml structure as shown above, the attributes of the
    element tag can be accessed with element.items() (or element.attrib),
    its child elements with list(element), any text associated with the
    <element> tag as element.text, and the name of the tag itself as
    element.tag.

    Args:
        xml_source: path of an XML file, or a string holding the XML.
        df_cols: column names to keep, in output order. A column that no
            record fills is present with empty values.
        source_is_file: True when xml_source is a file path.
        show_progress: print one line per record while parsing.

    Returns:
        A DataFrame with one row per record and exactly the columns in
        df_cols. The row label is the value of the record's first attribute,
        or "Tag[position]" (1-based) when the record has no attributes.
    """
    if source_is_file:
        xroot = ET.parse(xml_source).getroot()  # xml_source = xml_file
    else:
        xroot = ET.fromstring(xml_source)  # xml_source = xml_string

    def get_children_info(children, instance_dict):
        """Flatten attributes and text of *children* (recursively) into instance_dict."""
        for child in children:
            # list(element) gives the child elements; getchildren() is deprecated.
            grandchildren = list(child)
            if grandchildren:
                instance_dict = get_children_info(grandchildren, instance_dict)
            instance_dict.update(child.attrib)
            instance_dict[child.tag] = child.text
        return instance_dict

    consolidator_dict = {}
    # Loop over all instances
    for position, instance in enumerate(xroot, start=1):
        # Start from "every requested column is empty".
        instance_dict = dict.fromkeys(df_cols)
        attributes = list(instance.attrib.items())
        if attributes:
            ikey, ivalue = attributes[0]  # The first attribute is "ID"
            instance_dict[ikey] = ivalue
            row_key = ivalue
            progress = '{}: {}={}'.format(instance.tag, ikey, ivalue)
        else:
            # No attribute to identify the record: fall back to its position
            # instead of failing on an empty attribute list.
            row_key = '{}[{}]'.format(instance.tag, position)
            progress = '{}: {}'.format(instance.tag, row_key)
        if show_progress:
            print(progress)
        # Loop inside every instance
        consolidator_dict[row_key] = get_children_info(list(instance), instance_dict)

    df = pd.DataFrame(consolidator_dict).T
    # reindex (rather than df[df_cols]) also works when no record was found.
    return df.reindex(columns=df_cols)
Run the following to generate the desired output.
xml_source = r'grade_data.xml'
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
"ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", 'ReferenceAnswers']
df = xml2df(xml_source, df_cols, source_is_file = True)
df
Method: 2
Given you have the xml_string, you could convert xml >> dict >> dataframe. run the following to get the desired output.
Note: You will need to install xmltodict to use Method-2. This method is inspired by the solution suggested by @martin-blech at "How to convert XML to JSON in Python? [duplicate]". Kudos to @martin-blech for making it.
pip install -U xmltodict
Solution
def read_recursively(x, instance_dict):
    """Flatten one xmltodict mapping into *instance_dict*.

    xmltodict marks attributes with a leading '@' and stores the text of an
    element that also has attributes or children under the key '#text'.
    Attribute and element names are recorded without the '@' marker.

    A key is stored when it is listed in the module-level ``df_cols``, or
    when its element carries text of its own (that text is then the value).

    Args:
        x: mapping produced by xmltodict for one element.
        instance_dict: dict that receives the flattened values; it is
            updated in place and also returned.

    Returns:
        A tuple (instance_dict, txt) where txt is the '#text' of *x* itself,
        or '' when *x* has none.
    """
    txt = ''
    for key, value in x.items():
        # add simple text associated with element
        if key == '#text':
            txt = value
            continue
        # Drop the attribute marker so the name can match df_cols.
        k = key[1:] if key.startswith('@') else key
        # dig deeper if value is another dict
        child_txt = ''
        if isinstance(value, dict):
            instance_dict, child_txt = read_recursively(value, instance_dict)
        if child_txt != '':
            # update text to corresponding parent element; child_txt is local
            # to this key, so it cannot spill over onto the following keys.
            instance_dict[k] = child_txt
        elif k in df_cols:
            instance_dict[k] = value
    return (instance_dict, txt)
You will need the function read_recursively() given above. Now run the following.
import xmltodict, json
o = xmltodict.parse(xml_string) # INPUT: XML_STRING
#print(json.dumps(o)) # uncomment to see xml to json converted string
consolidated_dict = dict()
oi = o['Instances']['Instance']
# xmltodict returns a single mapping, not a list, when there is only one
# <Instance>; wrap it so the loop below always walks over records.
if isinstance(oi, dict):
    oi = [oi]
for x in oi:
    instance_dict, _ = read_recursively(x, dict())
    # Attributes are keyed with a leading '@' by xmltodict, hence "@ID".
    consolidated_dict[x.get("@ID")] = instance_dict
df = pd.DataFrame(consolidated_dict).T
# reindex keeps exactly df_cols, even if some column was never filled.
df = df.reindex(columns=df_cols)
df
Several issues:
Calling .find on the loop variable, node, expects a child node to exist: current_node.find('child_of_current_node'). However, since all the nodes are the children of root they do not maintain their own children, so no loop is required;
Not checking NoneType that can result from missing nodes with find() and prevents retrieving .tag or .text or other attributes;
Not retrieving node content with .text, otherwise the <Element... object is returned;
Consider this adjustment using the ternary condition expression a if condition else b to ensure variable has a value regardless:
def _node_text(path):
    """Return the text of the first element at *path* below xroot, or None."""
    node = xroot.find(path)
    return node.text if node is not None else None


def _node_attr(path, attr):
    """Return attribute *attr* of the first element at *path*, or None."""
    node = xroot.find(path)
    return node.get(attr) if node is not None else None


# In the sample document StudentID/TaskID/DataSource are attributes of
# <MetaInfo>, Label is an attribute of <Annotation>, and the remaining
# annotation details live on elements nested inside <Annotation>; find()
# on those names alone would always return None.
rows = []
s_name = xroot.attrib.get("ID")
s_student = _node_attr("MetaInfo", "StudentID")
s_task = _node_attr("MetaInfo", "TaskID")
s_source = _node_attr("MetaInfo", "DataSource")
s_desc = _node_text("ProblemDescription")
s_question = _node_text("Question")
s_ans = _node_text("Answer")
s_label = _node_attr("Annotation", "Label")
s_contextrequired = _node_attr("Annotation/AdditionalAnnotation", "ContextRequired")
s_extraInfoinAnswer = _node_attr("Annotation/AdditionalAnnotation", "ExtraInfoInAnswer")
s_comments = _node_text("Annotation/Comments")
s_watch = _node_attr("Annotation/Comments", "Watch")
s_referenceAnswers = _node_text("ReferenceAnswers")
# The key is "ContextRequired" so that it matches the name listed in df_cols.
rows.append({"ID": s_name,"StudentID":s_student, "TaskID": s_task,
             "DataSource": s_source, "ProblemDescription": s_desc ,
             "Question": s_question , "Answer": s_ans ,"Label": s_label,
             "ContextRequired": s_contextrequired , "ExtraInfoInAnswer": s_extraInfoinAnswer ,
             "Comments": s_comments , "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers
             })
out_df = pd.DataFrame(rows, columns = df_cols)
Alternatively, run a more dynamic version assigning to an inner dictionary using the iterator variable:
rows = []
for node in xroot:
inner = {}
inner[node.tag] = node.text
rows.append(inner)
out_df = pd.DataFrame(rows, columns = df_cols)
Or list/dict comprehension:
rows = [{node.tag: node.text} for node in xroot]
out_df = pd.DataFrame(rows, columns = df_cols)

extract xml to pandas dataframe with unknown number of nodes

The below code sample works if there is only one node.
However, our use case we dont know how many nodes we will receive
Convert a xml to pandas data frame python
Sample as below.
How we can parse this into dataframe
In particular, we don't know how many repeated nodes (such as the TAB_DETAIL_DATA sections in the sample below) we will receive in the feed file.
<?xml version = '1.0' encoding = 'UTF-8'?>
<EVENT spec="IDL:com/RfcCallEvents:1.0#Z_BAPI_UPDT_SERV_NOTIFICATION">
<eventHeader>
<objectName/>
<objectKey/>
<eventName/>
<eventId/>
</eventHeader>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>2</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>1</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_HEADER_DATA>
<QMNUM>030334920069</QMNUM>
<ZGSXREF>CONSUMER</ZGSXREF>
<ZVANTREF>G338005317</ZVANTREF>
<ZSHIPER/>
<ZSHPRNO/>
<ZRVREF/>
<ZTECHID>4HQ2OD6C19</ZTECHID>
<ZADREPAIR/>
<ZZKATR7/>
</TAB_HEADER_DATA>
</EVENT>
I suspect you need to parse xml-data to several dataframes, e.g. as follows:
import xmltodict # install this module first
data = """<?xml version = '1.0' encoding = 'UTF-8'?>
<EVENT spec="IDL:com/RfcCallEvents:1.0#Z_BAPI_UPDT_SERV_NOTIFICATION">
<eventHeader>
<objectName/>
<objectKey/>
<eventName/>
<eventId/>
</eventHeader>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>2</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>1</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_HEADER_DATA>
<QMNUM>030334920069</QMNUM>
<ZGSXREF>CONSUMER</ZGSXREF>
<ZVANTREF>G338005317</ZVANTREF>
<ZSHIPER/>
<ZSHPRNO/>
<ZRVREF/>
<ZTECHID>4HQ2OD6C19</ZTECHID>
<ZADREPAIR/>
<ZZKATR7/>
</TAB_HEADER_DATA>
</EVENT>"""
dct = xmltodict.parse(data)
def make_df(name="TAB_DETAIL_DATA", dct=dct):
    """Build a one-column DataFrame from the *name* section(s) of an EVENT.

    Args:
        name: tag name of the section below <EVENT> to extract.
        dct: mapping returned by xmltodict.parse for the whole document.

    Returns:
        A DataFrame with a single 'value' column indexed by the field names
        of the section. When the section occurs several times, the fields of
        all occurrences are stacked in document order. An empty DataFrame is
        returned when the section has no content.
    """
    section = dct['EVENT'][name]
    # xmltodict yields a list for a repeated tag and a single mapping
    # otherwise; normalise so both cases share one code path.
    records = section if isinstance(section, list) else [section]
    frames = [
        pd.DataFrame({'value': list(record.values())}, index=list(record.keys()))
        for record in records
        if record is not None  # an empty element is parsed as None
    ]
    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the frame on every iteration.
    return pd.concat(frames)
Now, you can experiment with the parser:
make_df(name="TAB_HEADER_DATA") # produces single df
make_df(name="TAB_DETAIL_DATA") # concatenates all content occurred in TAB_DETAIL_DATA sections, returns single df

Categories

Resources