How to convert XML data as a pandas data frame?

How to convert XML data as a pandas data frame? - python

I'm trying to analyze an XML file with Python. I need to get the XML data as a pandas data frame.
import pandas as pd
import xml.etree.ElementTree as et
def parse_XML(xml_file, df_cols, row_path=None, namespaces=None):
    """Parse an XML file into a DataFrame with one row per record element.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.
        df_cols: Column names. The first entry is read as an *attribute* of
            each record element; every following entry is an ElementPath
            expression (a tag name or a nested path such as ``"a/b"``) that is
            looked up below the record, and the text of the first match is
            used.
        row_path: Optional ElementPath expression selecting the record
            elements relative to the root (for example ``".//n:Quay"``).
            When omitted, the direct children of the root are the records.
        namespaces: Optional ``{prefix: uri}`` mapping used to resolve the
            prefixes that appear in ``row_path`` and ``df_cols``.

    Returns:
        A DataFrame whose columns are ``df_cols``; attributes or elements
        that are missing on a record are stored as ``None``.
    """
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()

    # Without row_path only the root's direct children are visited, so
    # records nested deeper in the document are never seen.
    if row_path is None:
        records = xroot
    else:
        records = xroot.iterfind(row_path, namespaces)

    id_col, child_cols = df_cols[0], df_cols[1:]
    rows = []
    for node in records:
        row = {id_col: node.attrib.get(id_col)}
        for el in child_cols:
            # A prefixed name such as "gml:pos" can only be resolved when the
            # prefix is declared in `namespaces`.
            found = node.find(el, namespaces)
            row[el] = found.text if found is not None else None
        rows.append(row)

    return pd.DataFrame(rows, columns=df_cols)
# NOTE: for the sample document this yields only None values: parse_XML visits
# the root's direct children (PublicationTimestamp, ParticipantRef,
# dataObjects), and none of them has a "Name" attribute or a direct child
# that matches the un-namespaced tag "gml".
parse_XML('/Users/newuser/Desktop/TESTRATP/arrets.xml', ["Name","gml"])
But I'm getting the data frame below.
Name gml
0 None None
1 None None
2 None None
My XML file is :
<?xml version="1.0" encoding="UTF-8"?>
<PublicationDelivery version="1.09:FR-NETEX_ARRET-2.1-1.0" xmlns="http://www.netex.org.uk/netex" xmlns:core="http://www.govtalk.gov.uk/core" xmlns:gml="http://www.opengis.net/gml/3.2" xmlns:ifopt="http://www.ifopt.org.uk/ifopt" xmlns:siri="http://www.siri.org.uk/siri" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.netex.org.uk/netex">
<PublicationTimestamp>2020-08-05T06:00:01+00:00</PublicationTimestamp>
<ParticipantRef>transport.data.gouv.fr</ParticipantRef>
<dataObjects>
<GeneralFrame id="FR:GeneralFrame:NETEX_ARRET:" version="any">
<members>
<Quay id="FR:Quay:zenbus_StopPoint_SP_351400003_LOC:" version="any">
<Name>ST FELICIEN - Centre</Name>
<Centroid>
<Location>
<gml:pos srsName="EPSG:2154">828054.2068251468 6444393.512041969</gml:pos>
</Location>
</Centroid>
<TransportMode>bus</TransportMode>
</Quay>
<Quay id="FR:Quay:zenbus_StopPoint_SP_361350004_LOC:" version="any">
<Name>ST FELICIEN - Chemin de Juny</Name>
<Centroid>
<Location>
<gml:pos srsName="EPSG:2154">828747.3172982805 6445226.100290826</gml:pos>
</Location>
</Centroid>
<TransportMode>bus</TransportMode>
</Quay>
<Quay id="FR:Quay:zenbus_StopPoint_SP_343500005_LOC:" version="any">
<Name>ST FELICIEN - Darone</Name>
<Centroid>
<Location>
<gml:pos srsName="EPSG:2154">829036.2709757038 6444724.878001894</gml:pos>
</Location>
</Centroid>
<TransportMode>bus</TransportMode>
</Quay>
<Quay id="FR:Quay:zenbus_StopPoint_SP_359440004_LOC:" version="any">
<Name>ST FELICIEN - Col de Fontayes</Name>
<Centroid>
<Location>
<gml:pos srsName="EPSG:2154">829504.7993360173 6445490.57188837</gml:pos>
</Location>
</Centroid>
<TransportMode>bus</TransportMode>
</Quay>
</members>
</GeneralFrame>
</dataObjects>
</PublicationDelivery>
I have included only a small part of my XML file here; I can't post the full file because it exceeds Stack Overflow's character limit. I'm wondering why I get the output above, and I don't know where my error is. As I'm new to this, could someone please help me? Thank you.

My approach is to avoid XML parsing and switch straight into pandas by using xmlplain to generate JSON from the XML.
import xmlplain
# Convert the XML into plain nested dicts/lists, then let pandas flatten them.
with open("so_sample.xml") as f:
    js = xmlplain.xml_to_obj(f, strip_space=True, fold_dict=True)
df1 = pd.json_normalize(js).explode("PublicationDelivery.dataObjects.GeneralFrame.members")
# cleanup column names...
df1 = df1.rename(columns={
    c: c.replace("PublicationDelivery.", "").replace("dataObjects.GeneralFrame.", "").strip()
    for c in df1.columns
})
# drop spurious columns
df1 = df1.drop(columns=[c for c in df1.columns if c[0] == "#"])
# expand second level of dictionaries
df1 = pd.json_normalize(df1.to_dict(orient="records"))
# cleanup columns from second set of dictionaries
df1 = df1.rename(columns={c: c.replace("members.Quay.", "") for c in df1.columns})
# expand next list and dicts
df1 = pd.json_normalize(df1.explode("Centroid.Location.gml:pos").to_dict(orient="records"))
# there are some NaNs - deal with them
# Assign the filled column back rather than calling fillna(..., inplace=True)
# on a column selection: the chained in-place form and the `method=` argument
# are both deprecated in recent pandas releases.
df1["Centroid.Location.gml:pos.#srsName"] = df1["Centroid.Location.gml:pos.#srsName"].ffill()
df1["Centroid.Location.gml:pos"] = df1["Centroid.Location.gml:pos"].bfill()
# de-dup
df1 = df1.groupby("#id", as_index=False).first()
# more columns than requested... for SO output
print(df1.loc[:, ["Name", "Centroid.Location.gml:pos.#srsName", "Centroid.Location.gml:pos"]].to_string(index=False))
output
Name Centroid.Location.gml:pos.#srsName Centroid.Location.gml:pos
ST FELICIEN - Darone EPSG:2154 829036.2709757038 6444724.878001894
ST FELICIEN - Centre EPSG:2154 828054.2068251468 6444393.512041969
ST FELICIEN - Col de Fontayes EPSG:2154 829504.7993360173 6445490.57188837
ST FELICIEN - Chemin de Juny EPSG:2154 828747.3172982805 6445226.100290826

Alternative solution using pandas-read-xml
pip install pandas-read-xml
import pandas_read_xml as pdx
from pandas_read_xml import fully_flatten
# `xml` is not defined in this snippet; presumably it is the path or URL of
# the XML document (confirm against the pandas-read-xml documentation).
# The list names the tags to descend through; the last one is treated as the
# "root" of the table.
df = pdx.read_xml(xml, ['PublicationDelivery', 'dataObjects', 'GeneralFrame', 'members']).pipe(fully_flatten)
The list is just the tags that you want to navigate to as the "root".
You may need to clean the column names afterwards.

Related

XML into Pandas dataframe

I have an XML file and I would like to parse it into a table. (Pandas dataframe)
Below is just a sample of the XML file. Those are only two of the records.
<?xml version="1.0" encoding="UTF-8"?>
<file>
<C13_335010X321A1_837Y6>
<BHT_BeginningOfHierarchicalTransaction>
<BHT01__HierarchicalStructureCode>0011</BHT01__HierarchicalStructureCode>
<BHT02__TransactionSetPurposeCode>00</BHT02__TransactionSetPurposeCode>
<BHT03__OriginatorApplicationTransactionIdentifier>513513TR</BHT03__OriginatorApplicationTransactionIdentifier>
<BHT04__TransactionSetCreationDate>20200212</BHT04__TransactionSetCreationDate>
<BHT05__TransactionSetCreationTime>1287</BHT05__TransactionSetCreationTime>
<BHT06__ClaimOrEncounterIdentifier>DD</BHT06__ClaimOrEncounterIdentifier>
</BHT_BeginningOfHierarchicalTransaction>
<Loop_1000A>
<NM1_SubmitterName_1000A>
<NM101__EntityIdentifierCode>27</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>9</NM102__EntityTypeQualifier>
<NM103__SubmitterLastOrOrganizationName>AAA</NM103__SubmitterLastOrOrganizationName>
<NM108__IdentificationCodeQualifier>22</NM108__IdentificationCodeQualifier>
<NM109__SubmitterIdentifier>55555500</NM109__SubmitterIdentifier>
</NM1_SubmitterName_1000A>
<PER_SubmitterEDIContactInformation_1000A>
<PER01__ContactFunctionCode>LK</PER01__ContactFunctionCode>
<PER02__SubmitterContactName>John Smith</PER02__SubmitterContactName>
<PER03__CommunicationNumberQualifier>WW</PER03__CommunicationNumberQualifier>
<PER04__CommunicationNumber>2132220011</PER04__CommunicationNumber>
<PER05__CommunicationNumberQualifier>DD</PER05__CommunicationNumberQualifier>
<PER06__CommunicationNumber>DD_2#GMAIL.COM</PER06__CommunicationNumber>
</PER_SubmitterEDIContactInformation_1000A>
</Loop_1000A>
<Loop_1000B>
<NM1_ReceiverName_1000B>
<NM101__EntityIdentifierCode>21</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>0</NM102__EntityTypeQualifier>
<NM103__ReceiverName>AAA</NM103__ReceiverName>
<NM108__IdentificationCodeQualifier>32</NM108__IdentificationCodeQualifier>
<NM109__ReceiverPrimaryIdentifier>2514521</NM109__ReceiverPrimaryIdentifier>
</NM1_ReceiverName_1000B>
</Loop_1000B>
<Loop_2000A>
<HL_BillingProviderHierarchicalLevel_2000A>
<HL01__HierarchicalIDNumber>32</HL01__HierarchicalIDNumber>
<HL03__HierarchicalLevelCode>54</HL03__HierarchicalLevelCode>
<HL04__HierarchicalChildCode>32</HL04__HierarchicalChildCode>
</HL_BillingProviderHierarchicalLevel_2000A>
<Loop_2010AA>
<NM1_BillingProviderName_2010AA>
<NM101__EntityIdentifierCode>54</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>21</NM102__EntityTypeQualifier>
<NM103__BillingProviderLastOrOrganizationalName>AAA</NM103__BillingProviderLastOrOrganizationalName>
<NM108__IdentificationCodeQualifier>XX</NM108__IdentificationCodeQualifier>
<NM109__BillingProviderIdentifier>515151325</NM109__BillingProviderIdentifier>
</NM1_BillingProviderName_2010AA>
<N3_BillingProviderAddress_2010AA>
<N301__BillingProviderAddressLine>214 SS STREET</N301__BillingProviderAddressLine>
</N3_BillingProviderAddress_2010AA>
<N4_BillingProviderCityStateZIPCode_2010AA>
<N401__BillingProviderCityName>LA</N401__BillingProviderCityName>
<N402__BillingProviderStateOrProvinceCode>CA</N402__BillingProviderStateOrProvinceCode>
<N403__BillingProviderPostalZoneOrZIPCode>93500</N403__BillingProviderPostalZoneOrZIPCode>
</N4_BillingProviderCityStateZIPCode_2010AA>
<REF_BillingProviderTaxIdentification_2010AA>
<REF01__ReferenceIdentificationQualifier>OI</REF01__ReferenceIdentificationQualifier>
<REF02__BillingProviderTaxIdentificationNumber>5135151315</REF02__BillingProviderTaxIdentificationNumber>
</REF_BillingProviderTaxIdentification_2010AA>
</Loop_2010AA>
<Loop_2000B>
<HL_SubscriberHierarchicalLevel_2000B>
<HL01__HierarchicalIDNumber>5</HL01__HierarchicalIDNumber>
<HL02__HierarchicalParentIDNumber>5</HL02__HierarchicalParentIDNumber>
<HL03__HierarchicalLevelCode>55</HL03__HierarchicalLevelCode>
<HL04__HierarchicalChildCode>5</HL04__HierarchicalChildCode>
</HL_SubscriberHierarchicalLevel_2000B>
<SBR_SubscriberInformation_2000B>
<SBR01__PayerResponsibilitySequenceNumberCode>L</SBR01__PayerResponsibilitySequenceNumberCode>
<SBR02__IndividualRelationshipCode>32</SBR02__IndividualRelationshipCode>
<SBR03__SubscriberGroupOrPolicyNumber>252525Z125</SBR03__SubscriberGroupOrPolicyNumber>
<SBR09__ClaimFilingIndicatorCode>NM</SBR09__ClaimFilingIndicatorCode>
</SBR_SubscriberInformation_2000B>
<Loop_2010BA>
<NM1_SubscriberName_2010BA>
<NM101__EntityIdentifierCode>DCX</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>5</NM102__EntityTypeQualifier>
<NM103__SubscriberLastName>SMITH</NM103__SubscriberLastName>
<NM104__SubscriberFirstName>JOHN</NM104__SubscriberFirstName>
<NM108__IdentificationCodeQualifier>CA</NM108__IdentificationCodeQualifier>
<NM109__SubscriberPrimaryIdentifier>3656361.</NM109__SubscriberPrimaryIdentifier>
</NM1_SubscriberName_2010BA>
<N3_SubscriberAddress_2010BA>
<N301__SubscriberAddressLine>111 STREET</N301__SubscriberAddressLine>
</N3_SubscriberAddress_2010BA>
<N4_SubscriberCityStateZIPCode_2010BA>
<N401__SubscriberCityName>LA</N401__SubscriberCityName>
<N402__SubscriberStateCode>CA</N402__SubscriberStateCode>
<N403__SubscriberPostalZoneOrZIPCode>93000</N403__SubscriberPostalZoneOrZIPCode>
</N4_SubscriberCityStateZIPCode_2010BA>
<DMG_SubscriberDemographicInformation_2010BA>
<DMG01__DateTimePeriodFormatQualifier>K5</DMG01__DateTimePeriodFormatQualifier>
<DMG02__SubscriberBirthDate>19851010</DMG02__SubscriberBirthDate>
<DMG03__SubscriberGenderCode>U</DMG03__SubscriberGenderCode>
</DMG_SubscriberDemographicInformation_2010BA>
</Loop_2010BA>
<Loop_2010BB>
<NM1_PayerName_2010BB>
<NM101__EntityIdentifierCode>FF</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>3</NM102__EntityTypeQualifier>
<NM103__PayerName>AAA</NM103__PayerName>
<NM108__IdentificationCodeQualifier>GF</NM108__IdentificationCodeQualifier>
<NM109__PayerIdentifier>32514</NM109__PayerIdentifier>
</NM1_PayerName_2010BB>
</Loop_2010BB>
<Loop_2300>
<CLM_ClaimInformation_2300>
<CLM01__PatientControlNumber>5413</CLM01__PatientControlNumber>
<CLM02__TotalClaimChargeAmount>651</CLM02__TotalClaimChargeAmount>
<CLM05_HealthCareServiceLocationInformation_2300>
<CLM05_01_PlaceOfServiceCode>13</CLM05_01_PlaceOfServiceCode>
<CLM05_02_FacilityCodeQualifier>D</CLM05_02_FacilityCodeQualifier>
<CLM05_03_ClaimFrequencyCode>3</CLM05_03_ClaimFrequencyCode>
</CLM05_HealthCareServiceLocationInformation_2300>
<CLM06__ProviderOrSupplierSignatureIndicator>N</CLM06__ProviderOrSupplierSignatureIndicator>
<CLM07__AssignmentOrPlanParticipationCode>R</CLM07__AssignmentOrPlanParticipationCode>
<CLM08__BenefitsAssignmentCertificationIndicator>N</CLM08__BenefitsAssignmentCertificationIndicator>
<CLM09__ReleaseOfInformationCode>N</CLM09__ReleaseOfInformationCode>
<CLM10__PatientSignatureSourceCode>X</CLM10__PatientSignatureSourceCode>
</CLM_ClaimInformation_2300>
<REF_ClaimIdentifierForTransmissionIntermediaries_2300>
<REF01__ReferenceIdentificationQualifier>J1</REF01__ReferenceIdentificationQualifier>
<REF02__ValueAddedNetworkTraceNumber>FVC2514543254</REF02__ValueAddedNetworkTraceNumber>
</REF_ClaimIdentifierForTransmissionIntermediaries_2300>
<HI_HealthCareDiagnosisCode_2300>
<HI01_HealthCareCodeInformation_2300>
<HI01_01_DiagnosisTypeCode>CCC</HI01_01_DiagnosisTypeCode>
<HI01_02_DiagnosisCode>N111</HI01_02_DiagnosisCode>
</HI01_HealthCareCodeInformation_2300>
</HI_HealthCareDiagnosisCode_2300>
<Loop_2310B>
<NM1_RenderingProviderName_2310B>
<NM101__EntityIdentifierCode>32</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>2</NM102__EntityTypeQualifier>
<NM103__RenderingProviderLastOrOrganizationName>JOHN</NM103__RenderingProviderLastOrOrganizationName>
<NM104__RenderingProviderFirstName>SMITH</NM104__RenderingProviderFirstName>
<NM108__IdentificationCodeQualifier>TT</NM108__IdentificationCodeQualifier>
<NM109__RenderingProviderIdentifier>25431251</NM109__RenderingProviderIdentifier>
</NM1_RenderingProviderName_2310B>
<PRV_RenderingProviderSpecialtyInformation_2310B>
<PRV01__ProviderCode>TR</PRV01__ProviderCode>
<PRV02__ReferenceIdentificationQualifier>VFD</PRV02__ReferenceIdentificationQualifier>
<PRV03__ProviderTaxonomyCode>135454353L</PRV03__ProviderTaxonomyCode>
</PRV_RenderingProviderSpecialtyInformation_2310B>
</Loop_2310B>
<Loop_2400>
<LX_ServiceLineNumber_2400>
<LX01__AssignedNumber>2</LX01__AssignedNumber>
</LX_ServiceLineNumber_2400>
<SV1_ProfessionalService_2400>
<SV101_CompositeMedicalProcedureIdentifier_2400>
<SV101_01_ProductOrServiceIDQualifier>EE</SV101_01_ProductOrServiceIDQualifier>
<SV101_02_ProcedureCode>99999</SV101_02_ProcedureCode>
<SV101_07_Description>BLOOD</SV101_07_Description>
</SV101_CompositeMedicalProcedureIdentifier_2400>
<SV102__LineItemChargeAmount>200</SV102__LineItemChargeAmount>
<SV103__UnitOrBasisForMeasurementCode>PP</SV103__UnitOrBasisForMeasurementCode>
<SV104__ServiceUnitCount>3.5</SV104__ServiceUnitCount>
<SV107_CompositeDiagnosisCodePointer_2400>
<SV107_01_DiagnosisCodePointer>2</SV107_01_DiagnosisCodePointer>
</SV107_CompositeDiagnosisCodePointer_2400>
</SV1_ProfessionalService_2400>
<DTP_DateServiceDate_2400>
<DTP01__DateTimeQualifier>654</DTP01__DateTimeQualifier>
<DTP02__DateTimePeriodFormatQualifier>U8</DTP02__DateTimePeriodFormatQualifier>
<DTP03__ServiceDate>20191010</DTP03__ServiceDate>
</DTP_DateServiceDate_2400>
<REF_LineItemControlNumber_2400>
<REF01__ReferenceIdentificationQualifier>5F</REF01__ReferenceIdentificationQualifier>
<REF02__LineItemControlNumber>DDD.32.123</REF02__LineItemControlNumber>
</REF_LineItemControlNumber_2400>
</Loop_2400>
</Loop_2300>
</Loop_2000B>
</Loop_2000A>
</C13_335010X321A1_837Y6>
<C13_335010X321A1_837Y6>
<BHT_BeginningOfHierarchicalTransaction>
<BHT01__HierarchicalStructureCode>0011</BHT01__HierarchicalStructureCode>
<BHT02__TransactionSetPurposeCode>00</BHT02__TransactionSetPurposeCode>
<BHT03__OriginatorApplicationTransactionIdentifier>513513TR</BHT03__OriginatorApplicationTransactionIdentifier>
<BHT04__TransactionSetCreationDate>20200212</BHT04__TransactionSetCreationDate>
<BHT05__TransactionSetCreationTime>1287</BHT05__TransactionSetCreationTime>
<BHT06__ClaimOrEncounterIdentifier>DD</BHT06__ClaimOrEncounterIdentifier>
</BHT_BeginningOfHierarchicalTransaction>
<Loop_1000A>
<NM1_SubmitterName_1000A>
<NM101__EntityIdentifierCode>27</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>9</NM102__EntityTypeQualifier>
<NM103__SubmitterLastOrOrganizationName>AAA</NM103__SubmitterLastOrOrganizationName>
<NM108__IdentificationCodeQualifier>22</NM108__IdentificationCodeQualifier>
<NM109__SubmitterIdentifier>55555500</NM109__SubmitterIdentifier>
</NM1_SubmitterName_1000A>
<PER_SubmitterEDIContactInformation_1000A>
<PER01__ContactFunctionCode>LK</PER01__ContactFunctionCode>
<PER02__SubmitterContactName>John Smith</PER02__SubmitterContactName>
<PER03__CommunicationNumberQualifier>WW</PER03__CommunicationNumberQualifier>
<PER04__CommunicationNumber>2132220011</PER04__CommunicationNumber>
<PER05__CommunicationNumberQualifier>DD</PER05__CommunicationNumberQualifier>
<PER06__CommunicationNumber>DD_2#GMAIL.COM</PER06__CommunicationNumber>
</PER_SubmitterEDIContactInformation_1000A>
</Loop_1000A>
<Loop_1000B>
<NM1_ReceiverName_1000B>
<NM101__EntityIdentifierCode>21</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>0</NM102__EntityTypeQualifier>
<NM103__ReceiverName>AAA</NM103__ReceiverName>
<NM108__IdentificationCodeQualifier>32</NM108__IdentificationCodeQualifier>
<NM109__ReceiverPrimaryIdentifier>2514521</NM109__ReceiverPrimaryIdentifier>
</NM1_ReceiverName_1000B>
</Loop_1000B>
<Loop_2000A>
<HL_BillingProviderHierarchicalLevel_2000A>
<HL01__HierarchicalIDNumber>32</HL01__HierarchicalIDNumber>
<HL03__HierarchicalLevelCode>54</HL03__HierarchicalLevelCode>
<HL04__HierarchicalChildCode>32</HL04__HierarchicalChildCode>
</HL_BillingProviderHierarchicalLevel_2000A>
<Loop_2010AA>
<NM1_BillingProviderName_2010AA>
<NM101__EntityIdentifierCode>54</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>21</NM102__EntityTypeQualifier>
<NM103__BillingProviderLastOrOrganizationalName>AAA</NM103__BillingProviderLastOrOrganizationalName>
<NM108__IdentificationCodeQualifier>XX</NM108__IdentificationCodeQualifier>
<NM109__BillingProviderIdentifier>515151325</NM109__BillingProviderIdentifier>
</NM1_BillingProviderName_2010AA>
<N3_BillingProviderAddress_2010AA>
<N301__BillingProviderAddressLine>214 SS STREET</N301__BillingProviderAddressLine>
</N3_BillingProviderAddress_2010AA>
<N4_BillingProviderCityStateZIPCode_2010AA>
<N401__BillingProviderCityName>LA</N401__BillingProviderCityName>
<N402__BillingProviderStateOrProvinceCode>CA</N402__BillingProviderStateOrProvinceCode>
<N403__BillingProviderPostalZoneOrZIPCode>93500</N403__BillingProviderPostalZoneOrZIPCode>
</N4_BillingProviderCityStateZIPCode_2010AA>
<REF_BillingProviderTaxIdentification_2010AA>
<REF01__ReferenceIdentificationQualifier>OI</REF01__ReferenceIdentificationQualifier>
<REF02__BillingProviderTaxIdentificationNumber>5135151315</REF02__BillingProviderTaxIdentificationNumber>
</REF_BillingProviderTaxIdentification_2010AA>
</Loop_2010AA>
<Loop_2000B>
<HL_SubscriberHierarchicalLevel_2000B>
<HL01__HierarchicalIDNumber>5</HL01__HierarchicalIDNumber>
<HL02__HierarchicalParentIDNumber>5</HL02__HierarchicalParentIDNumber>
<HL03__HierarchicalLevelCode>55</HL03__HierarchicalLevelCode>
<HL04__HierarchicalChildCode>5</HL04__HierarchicalChildCode>
</HL_SubscriberHierarchicalLevel_2000B>
<SBR_SubscriberInformation_2000B>
<SBR01__PayerResponsibilitySequenceNumberCode>L</SBR01__PayerResponsibilitySequenceNumberCode>
<SBR02__IndividualRelationshipCode>32</SBR02__IndividualRelationshipCode>
<SBR03__SubscriberGroupOrPolicyNumber>252525Z125</SBR03__SubscriberGroupOrPolicyNumber>
<SBR09__ClaimFilingIndicatorCode>NM</SBR09__ClaimFilingIndicatorCode>
</SBR_SubscriberInformation_2000B>
<Loop_2010BA>
<NM1_SubscriberName_2010BA>
<NM101__EntityIdentifierCode>DCX</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>5</NM102__EntityTypeQualifier>
<NM103__SubscriberLastName>SMITH</NM103__SubscriberLastName>
<NM104__SubscriberFirstName>JOHN</NM104__SubscriberFirstName>
<NM108__IdentificationCodeQualifier>CA</NM108__IdentificationCodeQualifier>
<NM109__SubscriberPrimaryIdentifier>3656361.</NM109__SubscriberPrimaryIdentifier>
</NM1_SubscriberName_2010BA>
<N3_SubscriberAddress_2010BA>
<N301__SubscriberAddressLine>111 STREET</N301__SubscriberAddressLine>
</N3_SubscriberAddress_2010BA>
<N4_SubscriberCityStateZIPCode_2010BA>
<N401__SubscriberCityName>LA</N401__SubscriberCityName>
<N402__SubscriberStateCode>CA</N402__SubscriberStateCode>
<N403__SubscriberPostalZoneOrZIPCode>93000</N403__SubscriberPostalZoneOrZIPCode>
</N4_SubscriberCityStateZIPCode_2010BA>
<DMG_SubscriberDemographicInformation_2010BA>
<DMG01__DateTimePeriodFormatQualifier>K5</DMG01__DateTimePeriodFormatQualifier>
<DMG02__SubscriberBirthDate>19851010</DMG02__SubscriberBirthDate>
<DMG03__SubscriberGenderCode>U</DMG03__SubscriberGenderCode>
</DMG_SubscriberDemographicInformation_2010BA>
</Loop_2010BA>
<Loop_2010BB>
<NM1_PayerName_2010BB>
<NM101__EntityIdentifierCode>FF</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>3</NM102__EntityTypeQualifier>
<NM103__PayerName>AAA</NM103__PayerName>
<NM108__IdentificationCodeQualifier>GF</NM108__IdentificationCodeQualifier>
<NM109__PayerIdentifier>32514</NM109__PayerIdentifier>
</NM1_PayerName_2010BB>
</Loop_2010BB>
<Loop_2300>
<CLM_ClaimInformation_2300>
<CLM01__PatientControlNumber>5413</CLM01__PatientControlNumber>
<CLM02__TotalClaimChargeAmount>651</CLM02__TotalClaimChargeAmount>
<CLM05_HealthCareServiceLocationInformation_2300>
<CLM05_01_PlaceOfServiceCode>13</CLM05_01_PlaceOfServiceCode>
<CLM05_02_FacilityCodeQualifier>D</CLM05_02_FacilityCodeQualifier>
<CLM05_03_ClaimFrequencyCode>3</CLM05_03_ClaimFrequencyCode>
</CLM05_HealthCareServiceLocationInformation_2300>
<CLM06__ProviderOrSupplierSignatureIndicator>N</CLM06__ProviderOrSupplierSignatureIndicator>
<CLM07__AssignmentOrPlanParticipationCode>R</CLM07__AssignmentOrPlanParticipationCode>
<CLM08__BenefitsAssignmentCertificationIndicator>N</CLM08__BenefitsAssignmentCertificationIndicator>
<CLM09__ReleaseOfInformationCode>N</CLM09__ReleaseOfInformationCode>
<CLM10__PatientSignatureSourceCode>X</CLM10__PatientSignatureSourceCode>
</CLM_ClaimInformation_2300>
<REF_ClaimIdentifierForTransmissionIntermediaries_2300>
<REF01__ReferenceIdentificationQualifier>J1</REF01__ReferenceIdentificationQualifier>
<REF02__ValueAddedNetworkTraceNumber>FVC2514543254</REF02__ValueAddedNetworkTraceNumber>
</REF_ClaimIdentifierForTransmissionIntermediaries_2300>
<HI_HealthCareDiagnosisCode_2300>
<HI01_HealthCareCodeInformation_2300>
<HI01_01_DiagnosisTypeCode>CCC</HI01_01_DiagnosisTypeCode>
<HI01_02_DiagnosisCode>N111</HI01_02_DiagnosisCode>
</HI01_HealthCareCodeInformation_2300>
</HI_HealthCareDiagnosisCode_2300>
<Loop_2310B>
<NM1_RenderingProviderName_2310B>
<NM101__EntityIdentifierCode>32</NM101__EntityIdentifierCode>
<NM102__EntityTypeQualifier>2</NM102__EntityTypeQualifier>
<NM103__RenderingProviderLastOrOrganizationName>JOHN</NM103__RenderingProviderLastOrOrganizationName>
<NM104__RenderingProviderFirstName>SMITH</NM104__RenderingProviderFirstName>
<NM108__IdentificationCodeQualifier>TT</NM108__IdentificationCodeQualifier>
<NM109__RenderingProviderIdentifier>25431251</NM109__RenderingProviderIdentifier>
</NM1_RenderingProviderName_2310B>
<PRV_RenderingProviderSpecialtyInformation_2310B>
<PRV01__ProviderCode>TR</PRV01__ProviderCode>
<PRV02__ReferenceIdentificationQualifier>VFD</PRV02__ReferenceIdentificationQualifier>
<PRV03__ProviderTaxonomyCode>135454353L</PRV03__ProviderTaxonomyCode>
</PRV_RenderingProviderSpecialtyInformation_2310B>
</Loop_2310B>
<Loop_2400>
<LX_ServiceLineNumber_2400>
<LX01__AssignedNumber>2</LX01__AssignedNumber>
</LX_ServiceLineNumber_2400>
<SV1_ProfessionalService_2400>
<SV101_CompositeMedicalProcedureIdentifier_2400>
<SV101_01_ProductOrServiceIDQualifier>EE</SV101_01_ProductOrServiceIDQualifier>
<SV101_02_ProcedureCode>99999</SV101_02_ProcedureCode>
<SV101_07_Description>BLOOD</SV101_07_Description>
</SV101_CompositeMedicalProcedureIdentifier_2400>
<SV102__LineItemChargeAmount>200</SV102__LineItemChargeAmount>
<SV103__UnitOrBasisForMeasurementCode>PP</SV103__UnitOrBasisForMeasurementCode>
<SV104__ServiceUnitCount>3.5</SV104__ServiceUnitCount>
<SV107_CompositeDiagnosisCodePointer_2400>
<SV107_01_DiagnosisCodePointer>2</SV107_01_DiagnosisCodePointer>
</SV107_CompositeDiagnosisCodePointer_2400>
</SV1_ProfessionalService_2400>
<DTP_DateServiceDate_2400>
<DTP01__DateTimeQualifier>654</DTP01__DateTimeQualifier>
<DTP02__DateTimePeriodFormatQualifier>U8</DTP02__DateTimePeriodFormatQualifier>
<DTP03__ServiceDate>20191010</DTP03__ServiceDate>
</DTP_DateServiceDate_2400>
<REF_LineItemControlNumber_2400>
<REF01__ReferenceIdentificationQualifier>5F</REF01__ReferenceIdentificationQualifier>
<REF02__LineItemControlNumber>DDD.32.123</REF02__LineItemControlNumber>
</REF_LineItemControlNumber_2400>
</Loop_2400>
</Loop_2300>
</Loop_2000B>
</Loop_2000A>
</C13_335010X321A1_837Y6>
</file>
These should end up as two rows. I am using the following Python code to convert the file into a pandas data frame, but I am getting an empty data frame.
import pandas as pd
import xml.etree.ElementTree as et
def _leaf_texts(element, prefix=""):
    """Yield ``(path, text)`` for every childless descendant of *element*.

    *path* is the '/'-joined chain of tags from just below *element* down to
    the leaf, so equally named leaves under different parents stay distinct.
    """
    for child in element:
        if not isinstance(child.tag, str):
            # Comments and processing instructions have no tag name.
            continue
        path = prefix + "/" + child.tag if prefix else child.tag
        if len(child):
            yield from _leaf_texts(child, path)
        else:
            yield path, child.text


def xml_file(file, record_tag='C13_335010X321A1_837Y6'):
    """Yield one flat dict per ``record_tag`` element found under *file*.

    Each dict starts with the attributes of *file* (the root element), is
    updated with the record's own attributes, and then receives the text of
    every leaf element below the record, keyed by its '/'-joined tag path.
    Reading only attributes produced empty dicts for documents that keep all
    of their data in element text.

    Args:
        file: Root ``Element`` of the parsed document.
        record_tag: Tag of the elements that each become one row.

    Yields:
        dict mapping column name to string value (``None`` for empty leaves).
    """
    columns = file.attrib
    for xml in file.iter(record_tag):
        file_dict = columns.copy()
        file_dict.update(xml.attrib)
        for path, text in _leaf_texts(xml):
            # Repeated siblings share a path; suffix later ones so that no
            # value is silently overwritten.
            key = path
            n = 1
            while key in file_dict:
                n += 1
                key = "%s[%d]" % (path, n)
            file_dict[key] = text
        yield file_dict
# Parse the document and take its root element (<file> in the sample above).
tree = et.parse(r"C:\Users\Desktop\test1.xml")
root = tree.getroot()
# Build the frame with one row per dict yielded by xml_file.
df = pd.DataFrame(list(xml_file(root)))

What is the best way to parse a large XML file and generate a dataframe from its data (with Python or otherwise)?

I try to make a table (or csv, I'm using pandas dataframe) from the information of an XML file.
The file is here (.zip is 14 MB, XML is ~370MB), https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip . It has package information of different languages - node.js, python, java etc. aka, CPE 2.3 list by the US government org NVD.
this is how it looks like in the first 30 rows:
<cpe-list xmlns:config="http://scap.nist.gov/schema/configuration/0.1" xmlns="http://cpe.mitre.org/dictionary/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.3" xmlns:cpe-23="http://scap.nist.gov/schema/cpe-extension/2.3" xmlns:ns6="http://scap.nist.gov/schema/scap-core/0.1" xmlns:meta="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" xsi:schemaLocation="http://scap.nist.gov/schema/cpe-extension/2.3 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary-extension_2.3.xsd http://cpe.mitre.org/dictionary/2.0 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary_2.3.xsd http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2 https://scap.nist.gov/schema/cpe/2.1/cpe-dictionary-metadata_0.2.xsd http://scap.nist.gov/schema/scap-core/0.3 https://scap.nist.gov/schema/nvd/scap-core_0.3.xsd http://scap.nist.gov/schema/configuration/0.1 https://scap.nist.gov/schema/nvd/configuration_0.1.xsd http://scap.nist.gov/schema/scap-core/0.1 https://scap.nist.gov/schema/nvd/scap-core_0.1.xsd">
<generator>
<product_name>National Vulnerability Database (NVD)</product_name>
<product_version>4.9</product_version>
<schema_version>2.3</schema_version>
<timestamp>2022-03-17T03:51:01.909Z</timestamp>
</generator>
<cpe-item name="cpe:/a:%240.99_kindle_books_project:%240.99_kindle_books:6::~~~android~~">
<title xml:lang="en-US">$0.99 Kindle Books project $0.99 Kindle Books (aka com.kindle.books.for99) for android 6.0</title>
<references>
<reference href="https://play.google.com/store/apps/details?id=com.kindle.books.for99">Product information</reference>
<reference href="https://docs.google.com/spreadsheets/d/1t5GXwjw82SyunALVJb2w0zi3FoLRIkfGPc7AMjRF0r4/edit?pli=1#gid=1053404143">Government Advisory</reference>
</references>
<cpe-23:cpe23-item name="cpe:2.3:a:\$0.99_kindle_books_project:\$0.99_kindle_books:6:*:*:*:*:android:*:*"/>
</cpe-item>
The tree structure of the XML file is quite simple, the root is 'cpe-list', the child element is 'cpe-item', and the grandchild elements are 'title', 'references' and 'cpe23-item'.
From 'title', I want the text in the element;
From 'cpe23-item', I want the attribute 'name';
From 'references', I want the attributes 'href' from its great-grandchildren, 'reference'.
The dataframe should look like this:
| cpe23_name | title_text | ref1 | ref2 | ref3 | ref_other
0 | 'cpe23name 1'| 'this is a python pkg'| 'url1'| 'url2'| NaN | NaN
1 | 'cpe23name 2'| 'this is a java pkg' | 'url1'| 'url2'| NaN | NaN
...
My code is below; it finishes in ~100 seconds:
import time
import xml.etree.ElementTree as et

import pandas as pd

# Load the whole CPE dictionary and collect one row per child of the root.
xtree = et.parse("official-cpe-dictionary_v2.3.xml")
xroot = xtree.getroot()

start_time = time.time()
df_cols = ["cpe", "text", "vendor", "product", "version", "changelog", "advisory", 'others']
title = '{http://cpe.mitre.org/dictionary/2.0}title'
ref = '{http://cpe.mitre.org/dictionary/2.0}references'
cpe_item = '{http://scap.nist.gov/schema/cpe-extension/2.3}cpe23-item'

# (keyword, column) pairs, tested in this order against the lower-cased text
# of each <reference>; the first keyword found decides the column.
ref_keywords = (
    ('version', 'version'),
    ('advisor', 'advisory'),
    ('product', 'product'),
    ('vendor', 'vendor'),
    ('change', 'changelog'),
)

rows = []
for item in xroot:
    # Fresh per-item record: every column starts as None.
    record = dict.fromkeys(df_cols)
    for elm in item:
        if elm.tag == title:
            record["text"] = elm.text
        elif elm.tag == ref:
            for nn in elm:
                # A <reference> without text has text None; treat it as ""
                # so it falls through to 'others' instead of raising.
                s = (nn.text or "").lower()
                column = next((col for key, col in ref_keywords if key in s), 'others')
                # Several references can map to one column; the last one wins.
                record[column] = nn.attrib.get('href')
        elif elm.tag == cpe_item:
            record["cpe"] = elm.attrib.get("name")
        else:
            # Report tags that are not handled.
            print(elm.tag)
    rows.append([record[col] for col in df_cols])
    print(len(rows))  # this shows how far I got during the running time

# Build the DataFrame once, after the loop, from the collected rows.
out_df1 = pd.DataFrame(rows, columns=df_cols)
print("---853k rows take %s seconds ---" % (time.time() - start_time))
Update: a faster way is to move the second-to-last line (the DataFrame construction) outside the loop. Since 'rows' already collects the data on each iteration, there is no need to build a new dataframe every time.
the running time now is 136.0491042137146 seconds. yay!

Since your XML is fairly flat, consider pandas.read_xml, an IO function introduced in pandas v1.3. Because the XML uses a default namespace, pass the namespaces argument so that the XPath expression can reference its elements:
# The document's default namespace is bound to the prefix "doc" so that the
# XPath expression can address the cpe-item elements.
url = "https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip"
df = pd.read_xml(
url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}
)
If you do not have the default parser, lxml, installed, use the etree parser:
# Same query as with the default parser, but parsed with parser="etree".
df = pd.read_xml(
url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}, parser="etree"
)

different return types for getpath() in lxml

I have folders full of XML files which I want to parse to a dataframe. The following functions iterate through an XML tree recursively and return a dataframe with three columns: path, attributes and text.
def XML2DF(filename,df1,MAX_DEPTH=20):
    """Parse the XML file *filename* and append one row per element to *df1*.

    Args:
        filename: Path of the XML file to read.
        df1: DataFrame with the columns ``path``, ``attrib`` and ``text``
            that the new rows are concatenated onto.
        MAX_DEPTH: Deepest nesting level (root = 0) that is still recorded.

    Returns:
        The DataFrame produced by ``recursive_parseXML2DF``. The previous
        bare ``return`` discarded it, so callers always received ``None``.
    """
    # Read bytes so the parser applies the encoding declared in the XML
    # prolog; lxml rejects str input that carries an encoding declaration.
    with open(filename, "rb") as f:
        xml_bytes = f.read()
    tree = etree.fromstring(xml_bytes)
    return recursive_parseXML2DF(tree, df1, MAX_DEPTH=MAX_DEPTH)
def recursive_parseXML2DF(element, df1, depth=0, MAX_DEPTH=20):
    """Append *element* and its descendants to *df1*, one row per element.

    Each row holds the element's path as reported by
    ``element.getroottree().getpath(element)``, its ``attrib`` mapping and
    its ``text``.

    NOTE(review): lxml's ``getpath`` appears to emit ``*`` steps (for example
    ``/*/*[1]``) for elements in a default, unprefixed namespace;
    ``getelementpath`` may be the alternative if tag names are needed.
    Confirm against the lxml documentation.

    Args:
        element: Element to record; its children are visited recursively.
        df1: DataFrame the rows are concatenated onto.
        depth: Nesting level of *element* (the root is 0).
        MAX_DEPTH: Elements deeper than this level are skipped.

    Returns:
        A new DataFrame containing the rows of *df1* followed by the rows
        for *element* and its descendants, in document order.
    """
    if depth > MAX_DEPTH:
        return df1
    df2 = pd.DataFrame([[element.getroottree().getpath(element), element.attrib, element.text]],
                       columns=["path", "attrib", "text"])
    df1 = pd.concat([df1, df2])
    # Iterate the element directly instead of the deprecated getchildren().
    for child in element:
        # Pass MAX_DEPTH along; otherwise every nested call silently falls
        # back to the default limit of 20.
        df1 = recursive_parseXML2DF(child, df1, depth=depth + 1, MAX_DEPTH=MAX_DEPTH)
    return df1
The code for the function was adapted from this post.
Most of the times the function works fine and returns the entire path but for some documents the returned path looks like this:
/*/*[1]/*[3]
/*/*[1]/*[3]/*[1]
The text tag entry remains valid and correct.
The only difference I can make out between the XML documents that yield working paths and those that yield wildcard paths is that the latter write their XML tags in all caps.
Working example:
<?xml version="1.0" encoding="utf-8"?>
<root>
<Header>
<ReceivingApplication>ReceivingApplication</ReceivingApplication>
<SendingApplication>SendingApplication</SendingApplication>
<MessageControlID>12345</MessageControlID>
<ReceivingApplication>ReceivingApplication</ReceivingApplication>
<FileCreationDate>2000-01-01T00:00:00</FileCreationDate>
</Header>
<Einsendung>
<Patient>
<PatientName>Name</PatientName>
<PatientVorname>FirstName</PatientVorname>
<PatientGebDat>2000-01-01T00:00:00</PatientGebDat>
<PatientSex>4</PatientSex>
<PatientPWID>123456</PatientPWID>
</Patient>
<Visit>
<VisitNumber>A2000.0001</VisitNumber>
<PatientPLZ>1234</PatientPLZ>
<PatientOrt>PatientOrt</PatientOrt>
<PatientAdr2>
</PatientAdr2>
<PatientStrasse>PatientStrasse 01</PatientStrasse>
<VisitEinsID>1234</VisitEinsID>
<VisitBefund>VisitBefund</VisitBefund>
<Befunddatum>2000-01-01T00:00:00</Befunddatum>
</Visit>
</Einsendung>
</root>
nonsensical Example:
<?xml version="1.0"?>
<KRSCHWEIZ xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="krSCHWEIZ">
<KEY_VS>abcdefg</KEY_VS>
<KEY_KLR>abcdefg</KEY_KLR>
<ABSENDER>
<ABSENDER_MELDER_ID>123456</ABSENDER_MELDER_ID>
<MELDER>
<MELDER_ID>123456</MELDER_ID>
<QUELLSYSTEM>ABCDEF</QUELLSYSTEM>
<PATIENT>
<REFERENZNR>987654</REFERENZNR>
<NACHNAME>my name</NACHNAME>
<VORNAMEN>my first name</VORNAMEN>
<GEBURTSNAME />
<GEBURTSDATUM>my dob</GEBURTSDATUM>
<GESCHLECHT>XX</GESCHLECHT>
<PLZ>9999</PLZ>
<WOHNORT>Mycity</WOHNORT>
<STRASSE>mystreet</STRASSE>
<HAUSNR>99</HAUSNR>
<VERSICHERTENNR>999999999</VERSICHERTENNR>
<DATEIEN>
<DATEI>
<DATEINAME>my_attached_document.html</DATEINAME>
<DATEIBASE64>mybase_64_encoded_document</DATEIBASE64>
</DATEI>
</DATEIEN>
</PATIENT>
</MELDER>
</ABSENDER>
</KRSCHWEIZ>
How do I get correct explicit path information also for this case?

The presence of namespaces changes the output of .getpath() - you can use .getelementpath() instead, which will include the namespace (in {namespace-uri}tag form) instead of using wildcards.
If the namespace should be discarded completely - you can strip it from every tag before using .getpath()
import lxml.etree
import pandas as pd
# Replace every element's qualified tag with its local name so that
# getpath() reports real tag names, then record one row per element.
tree = lxml.etree.parse("broken.xml")
rows = []
for node in tree.iter():
    try:
        local_name = lxml.etree.QName(node).localname
    except ValueError:
        # QName() cannot derive a name for this node; leave it out.
        continue
    node.tag = local_name
    rows.append([tree.getpath(node), node.attrib, node.text])
df = pd.DataFrame(rows, columns=["path", "attrib", "text"])
Resulting dataframe:
>>> df
path attrib text
0 /KRSCHWEIZ [] \n
1 /KRSCHWEIZ/KEY_VS [] abcdefg
2 /KRSCHWEIZ/KEY_KLR [] abcdefg
3 /KRSCHWEIZ/ABSENDER [] \n
4 /KRSCHWEIZ/ABSENDER/ABSENDER_MELDER_ID [] 123456
5 /KRSCHWEIZ/ABSENDER/MELDER [] \n
6 /KRSCHWEIZ/ABSENDER/MELDER/MELDER_ID [] 123456
7 /KRSCHWEIZ/ABSENDER/MELDER/QUELLSYSTEM [] ABCDEF
8 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT [] \n
9 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/REFERENZNR [] 987654
10 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/NACHNAME [] my name
11 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/VORNAMEN [] my first name
12 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/GEBURTSNAME [] None
13 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/GEBURTSDATUM [] my dob
14 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/GESCHLECHT [] XX
15 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/PLZ [] 9999
16 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/WOHNORT [] Mycity
17 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/STRASSE [] mystreet
18 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/HAUSNR [] 99
19 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/VERSICHERTENNR [] 999999999
20 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN [] \n
21 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN/DATEI [] \n
22 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN/DAT... [] my_attached_document.html
23 /KRSCHWEIZ/ABSENDER/MELDER/PATIENT/DATEIEN/DAT... [] mybase_64_encoded_document

Python XML Parser Issue

I am new to python. Sorry for asking this stupid question.
I am trying to read a XML file to python object (preferably to pandas)
For now I am just trying to print the variables, to see if I can read them properly in a tabular form.
I have used xml.etree.ElementTree for this, but I might not be using it as intended.
Code:
import xml.etree.ElementTree as ET
tree = ET.parse("data.xml")
ODM = tree.getroot()
ns = {'xmlns': 'http://www.cdisc.org/ns/odm/v1.3',
'mdsol': 'http://www.mdsol.com/ns/odm/metadata'}
for ClinicalData in ODM:
LocationOID=None
#print(ClinicalData.tag, ClinicalData.attrib)
for SubjectData in ClinicalData:
for SiteRef in SubjectData:
LocationOID=SiteRef.attrib.get('LocationOID')
for StudyEventData in SubjectData:
for AuditRecord in StudyEventData:
print(ClinicalData.attrib.get('MetaDataVersionOID'),
ClinicalData.attrib.get('AuditSubCategoryName'), #null ouptput due to namespace issue
SubjectData.attrib.get('SubjectKey'),
SubjectData.attrib.get('SubjectName'), #null ouptput due to namespace issue
LocationOID, #not sure what is the issue
StudyEventData.attrib.get('StudyEventRepeatKey'),
AuditRecord.find('DateTimeStamp') #not sure what is the issue
)
Input:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<ODM xmlns="http://www.cdisc.org/ns/odm/v1.3"
xmlns:mdsol="http://www.mdsol.com/ns/odm/metadata"
CreationDateTime="2019-08-23T12:59:09" FileOID="3b2b4161-fad8-4239-9c83-03d0e62624dd" FileType="Transactional" ODMVersion="1.3">
<ClinicalData MetaDataVersionOID="1772" StudyOID="0ACC SP3 MAPPING1(DEV)" mdsol:AuditSubCategoryName="Activated">
<SubjectData SubjectKey="7735fd9c-1792-457c-aa58-0ca26ecdc810" mdsol:SubjectKeyType="SubjectUUID" mdsol:SubjectName="ACC-SUBJ-3">
<SiteRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<StudyEventData StudyEventOID="FV" StudyEventRepeatKey="VIST[1]/FV[1]" mdsol:InstanceId="2960580">
<AuditRecord>
<UserRef UserOID="systemuser"/>
<LocationRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<DateTimeStamp>2019-07-10T07:56:54</DateTimeStamp>
<ReasonForChange>Update</ReasonForChange>
<SourceID>394263772</SourceID>
</AuditRecord>
</StudyEventData>
</SubjectData>
</ClinicalData>
</ODM>
I expect every printed variable to hold the corresponding value from the XML file. Please let me know if there is a better way of doing this than nesting so many loops.

Namespaces are a pain using ElementTree. See this discussion.
Short answer:
# Walk ODM -> ClinicalData -> SubjectData -> StudyEventData -> AuditRecord.
# ElementTree keeps namespaced names in "{namespace-uri}local" form, so each
# namespaced tag or attribute lookup below spells the URI out in full.
for ClinicalData in ODM:
    for SubjectData in ClinicalData:
        SiteRef = SubjectData.find('{http://www.cdisc.org/ns/odm/v1.3}SiteRef')
        LocationOID = SiteRef.attrib.get('LocationOID')
        for StudyEventData in SubjectData:
            for AuditRecord in StudyEventData:
                DateTimeStamp = AuditRecord.find(
                    '{http://www.cdisc.org/ns/odm/v1.3}DateTimeStamp')
                print(
                    ClinicalData.attrib.get('MetaDataVersionOID'),
                    # mdsol-prefixed attribute: needs the mdsol namespace URI
                    ClinicalData.attrib.get(
                        '{http://www.mdsol.com/ns/odm/metadata}AuditSubCategoryName'),
                    SubjectData.attrib.get('SubjectKey'),
                    # mdsol-prefixed attribute: needs the mdsol namespace URI
                    SubjectData.attrib.get(
                        '{http://www.mdsol.com/ns/odm/metadata}SubjectName'),
                    LocationOID,
                    StudyEventData.attrib.get('StudyEventRepeatKey'),
                    # .text yields the element's content, not the Element object
                    DateTimeStamp.text,
                )

I think you can use BeautifulSoup for parsing XML:
from bs4 import BeautifulSoup
temp ="""<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<ODM xmlns="http://www.cdisc.org/ns/odm/v1.3"
xmlns:mdsol="http://www.mdsol.com/ns/odm/metadata"
CreationDateTime="2019-08-23T12:59:09" FileOID="3b2b4161-fad8-4239-9c83-03d0e62624dd" FileType="Transactional" ODMVersion="1.3">
<ClinicalData MetaDataVersionOID="1772" StudyOID="0ACC SP3 MAPPING1(DEV)" mdsol:AuditSubCategoryName="Activated">
<SubjectData SubjectKey="7735fd9c-1792-457c-aa58-0ca26ecdc810" mdsol:SubjectKeyType="SubjectUUID" mdsol:SubjectName="ACC-SUBJ-3">
<SiteRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<StudyEventData StudyEventOID="FV" StudyEventRepeatKey="VIST[1]/FV[1]" mdsol:InstanceId="2960580">
<AuditRecord>
<UserRef UserOID="systemuser"/>
<LocationRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<DateTimeStamp>2019-07-10T07:56:54</DateTimeStamp>
<ReasonForChange>Update</ReasonForChange>
<SourceID>394263772</SourceID>
</AuditRecord>
</StudyEventData>
</SubjectData>
</ClinicalData>
</ODM>"""
temp=BeautifulSoup(temp,"lxml")
ClinicalData = temp.find('ClinicalData'.lower())
SubjectData = ClinicalData.find_all('SubjectData'.lower())
LocationOID=None
for i in SubjectData:
SiteRef = i.find('SiteRef'.lower())
LocationOID = SiteRef.attrs['locationoid']
print('LocationOID',LocationOID)
output:
LocationOID 0ACCSP3MAPPING1SITE1
[Finished in 1.2s]

#Justin
I have applied your suggestions, it worked, until I broke it.
Input:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<ODM xmlns="http://www.cdisc.org/ns/odm/v1.3" xmlns:mdsol="http://www.mdsol.com/ns/odm/metadata" CreationDateTime="2019-08-23T12:59:09" FileOID="3b2b4161-fad8-4239-9c83-03d0e62624dd" FileType="Transactional" ODMVersion="1.3">
<ClinicalData MetaDataVersionOID="2965" StudyOID="0ACC SP3 MAPPING1(DEV)" mdsol:AuditSubCategoryName="Entered">
<SubjectData SubjectKey="481e4653-693c-4e15-8762-d8a66c0d2cf1" mdsol:SubjectKeyType="SubjectUUID" mdsol:SubjectName="ACC-SUBJ-1">
<SiteRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<StudyEventData StudyEventOID="FV" StudyEventRepeatKey="VIST[1]/FV[1]" mdsol:InstanceId="2960564">
<FormData FormOID="VS" FormRepeatKey="1" mdsol:DataPageId="15331229">
<ItemGroupData ItemGroupOID="VS" mdsol:RecordId="17928808">
<ItemData ItemOID="VS.WT" TransactionType="Upsert" Value="45">
<AuditRecord>
<UserRef UserOID="alscrave2"/>
<LocationRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<DateTimeStamp>2018-02-02T09:39:30</DateTimeStamp>
<ReasonForChange/>
<SourceID>122841525</SourceID>
</AuditRecord>
<MeasurementUnitRef MeasurementUnitOID="1761.Weight.1"/>
</ItemData>
</ItemGroupData>
</FormData>
</StudyEventData>
</SubjectData>
</ClinicalData>
<ClinicalData MetaDataVersionOID="2965" StudyOID="0ACC SP3 MAPPING1(DEV)" mdsol:AuditSubCategoryName="Entered">
<SubjectData SubjectKey="481e4653-693c-4e15-8762-d8a66c0d2cf1" mdsol:SubjectKeyType="SubjectUUID" mdsol:SubjectName="ACC-SUBJ-1">
<SiteRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<StudyEventData StudyEventOID="FV" StudyEventRepeatKey="VIST[1]/FV[1]" mdsol:InstanceId="2960564">
<FormData FormOID="VS" FormRepeatKey="1" mdsol:DataPageId="15331229">
<ItemGroupData ItemGroupOID="VS" mdsol:RecordId="17928809">
<ItemData ItemOID="VS.WT" TransactionType="Upsert" Value="46">
<AuditRecord>
<UserRef UserOID="alscrave2"/>
<LocationRef LocationOID="0ACCSP3MAPPING1SITE1"/>
<DateTimeStamp>2018-02-02T09:39:30</DateTimeStamp>
<ReasonForChange/>
<SourceID>122841525</SourceID>
</AuditRecord>
<MeasurementUnitRef MeasurementUnitOID="1761.Weight.1"/>
</ItemData>
</ItemGroupData>
</FormData>
</StudyEventData>
</SubjectData>
</ClinicalData>
</ODM>
Code:
import xml.etree.ElementTree as ET
import pandas as pd
def getvalueofnode(node):
    """Return the ``text`` of *node*, or ``None`` when *node* is ``None``."""
    if node is None:
        return None
    return node.text
tree = ET.parse("data.xml")
ODM = tree.getroot()
xmlns = "{http://www.cdisc.org/ns/odm/v1.3}"
mdsol = "{http://www.mdsol.com/ns/odm/metadata}"
def data_reader():
# Walks the module-level ODM tree level by level, collects attribute and text
# values into local variables, appends them to a DataFrame and prints it.
# Nothing is returned.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below (in particular of the df_xml.append call) cannot be read
# off the text; confirm against the original script.
dfcols = ['CreationDateTime','StudyOID','MetaDataVersionOID','SubjectName','SUBJECTUUID','LocationOID','StudyEventOID',
'StudyEventRepeatKey','FormOID','FormRepeatKey','DataPageId','ItemgroupOID','RecordId','var_name','Value',
'DateTimeStamp','ASC_Name','Measurement_Unit','SourceID','UserOID','InstanceId']
df_xml = pd.DataFrame(columns=dfcols)
CreationDateTime = ODM.attrib.get('CreationDateTime')
# Each "for X in parent" below iterates over ALL direct children of parent,
# whatever their tag; it does not filter by element name.
for ClinicalData in ODM:
StudyOID = ClinicalData.attrib.get('StudyOID')
MetaDataVersionOID = ClinicalData.attrib.get('MetaDataVersionOID')
# Attributes written with the mdsol: prefix are keyed as "{namespace-uri}name".
ASC_Name = ClinicalData.attrib.get('{0}AuditSubCategoryName'.format(mdsol))
for SubjectData in ClinicalData:
SubjectName = SubjectData.attrib.get('{0}SubjectName'.format(mdsol))
SUBJECTUUID = SubjectData.attrib.get('SubjectKey')
# find() returns None when no SiteRef child exists; .attrib would then raise.
LocationOID = SubjectData.find('{0}SiteRef'.format(xmlns)).attrib.get('LocationOID')
# In the sample input SubjectData also contains SiteRef, so this loop visits
# it as a "StudyEventData" too (it has no children, so the inner loops are empty).
for StudyEventData in SubjectData:
StudyEventOID = StudyEventData.attrib.get('StudyEventOID')
StudyEventRepeatKey = StudyEventData.attrib.get('StudyEventRepeatKey')
InstanceId = StudyEventData.attrib.get('{0}InstanceId'.format(mdsol))
for FormData in StudyEventData:
FormOID = FormData.attrib.get('FormOID')
FormRepeatKey = FormData.attrib.get('FormRepeatKey')
DataPageId = FormData.attrib.get('{0}DataPageId'.format(mdsol))
for ItemGroupData in FormData:
# NOTE(review): the sample input spells this attribute "ItemGroupOID";
# attribute names are case-sensitive, so 'ItemgroupOID' yields None.
ItemgroupOID = ItemGroupData.attrib.get('ItemgroupOID')
RecordId = ItemGroupData.attrib.get('{0}RecordId'.format(mdsol))
for ItemData in ItemGroupData:
var_name = ItemData.attrib.get('ItemOID')
Value = ItemData.attrib.get('Value')
# NOTE(review): the string has no "{0}" placeholder, so .format(xmlns) leaves
# it unchanged and the search is for an un-namespaced tag; find() then
# returns None and .attrib raises AttributeError.
Measurement_Unit = ItemData.find('MeasurementUnitRef'.format(xmlns)).attrib.get('MeasurementUnitOID')
# NOTE(review): in the sample input ItemData has two children, AuditRecord and
# MeasurementUnitRef. The latter has no DateTimeStamp/SourceID child, so the
# find() calls below return None for it and .text raises AttributeError.
for AuditRecord in ItemData:
DateTimeStamp = AuditRecord.find('{0}DateTimeStamp'.format(xmlns)).text;
SourceID = AuditRecord.find('{0}SourceID'.format(xmlns)).text;
# NOTE(review): UserRef is searched on ItemData, but in the sample input it
# is a child of AuditRecord, so find() returns None here.
UserOID = ItemData.find('{0}UserRef'.format(xmlns)).attrib.get('UserOID')
# NOTE(review): DataFrame.append was removed in pandas 2.0 and copies the
# whole frame on every call; collecting the rows in a list and building the
# frame once avoids both problems.
df_xml = df_xml.append(
pd.Series([CreationDateTime,StudyOID,MetaDataVersionOID,SubjectName,
SUBJECTUUID,LocationOID,StudyEventOID,
StudyEventRepeatKey,FormOID,FormRepeatKey,DataPageId,ItemgroupOID,
RecordId,var_name,Value,DateTimeStamp,ASC_Name,Measurement_Unit,
SourceID,UserOID,InstanceId], index=dfcols),
ignore_index=True)
print(df_xml)
data_reader()
Issue: I am getting duplicate records. And variables DateTimeStamp, SourceID, UserOID and Measurement_Unit are throwing run time errors during assignment.

extract xml to pandas dataframe with unknown number of nodes

The code sample below works if there is only one node.
However, in our use case we don't know how many nodes we will receive.
Convert a xml to pandas data frame python
A sample is shown below.
How can we parse this into a dataframe?
In particular, we don't know how many repeated nodes (such as TAB_DETAIL_DATA)
we will receive in the feed file.
<?xml version = '1.0' encoding = 'UTF-8'?>
<EVENT spec="IDL:com/RfcCallEvents:1.0#Z_BAPI_UPDT_SERV_NOTIFICATION">
<eventHeader>
<objectName/>
<objectKey/>
<eventName/>
<eventId/>
</eventHeader>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>2</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>1</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_HEADER_DATA>
<QMNUM>030334920069</QMNUM>
<ZGSXREF>CONSUMER</ZGSXREF>
<ZVANTREF>G338005317</ZVANTREF>
<ZSHIPER/>
<ZSHPRNO/>
<ZRVREF/>
<ZTECHID>4HQ2OD6C19</ZTECHID>
<ZADREPAIR/>
<ZZKATR7/>
</TAB_HEADER_DATA>
</EVENT>

I suspect you need to parse xml-data to several dataframes, e.g. as follows:
import xmltodict # install this module first
data = """<?xml version = '1.0' encoding = 'UTF-8'?>
<EVENT spec="IDL:com/RfcCallEvents:1.0#Z_BAPI_UPDT_SERV_NOTIFICATION">
<eventHeader>
<objectName/>
<objectKey/>
<eventName/>
<eventId/>
</eventHeader>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>2</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_DETAIL_DATA>
<ZNEWFLAG>X</ZNEWFLAG>
<FENUM>1</FENUM>
<BAUTL>661-01727</BAUTL>
<OTEIL/>
<FECOD>KBB</FECOD>
<URCOD>B08</URCOD>
<ZCOMPMDF>A</ZCOMPMDF>
<ZOPREPL/>
<ZWRNCOV>LP</ZWRNCOV>
<ZWRNREF/>
<ZNEWPS>C07XMAAEJCLD</ZNEWPS>
<ZOLDPN/>
<ZOLDPD/>
<ZOLDPS>C07XMAACJCLD</ZOLDPS>
<MAILINFECOD/>
<ZUNITPR/>
<ZNEWPD/>
<ZNEWPN/>
<ZABUSE/>
<ZRPS>S</ZRPS>
<ZEXKGB/>
<ZKGBMM/>
<ZINSTS>000</ZINSTS>
<ZACKBB/>
<ZCHKOVR/>
<ZSNDB/>
<ZNOTAFISCAL/>
<ZCONSGMT/>
<ZPRTCONS/>
<ZZRTNTRNO/>
<ZZRTNCAR/>
<ZZINSPECT/>
<ZZPR_OPT/>
</TAB_DETAIL_DATA>
<TAB_HEADER_DATA>
<QMNUM>030334920069</QMNUM>
<ZGSXREF>CONSUMER</ZGSXREF>
<ZVANTREF>G338005317</ZVANTREF>
<ZSHIPER/>
<ZSHPRNO/>
<ZRVREF/>
<ZTECHID>4HQ2OD6C19</ZTECHID>
<ZADREPAIR/>
<ZZKATR7/>
</TAB_HEADER_DATA>
</EVENT>"""
dct = xmltodict.parse(data)
def make_df(name="TAB_DETAIL_DATA", dct=dct):
    """Return the EVENT section *name* as a DataFrame with one 'value' column.

    The index holds the keys of the section. When the section is a list
    (the tag occurred more than once), one frame is built per list entry and
    the frames are stacked in list order.
    """
    def as_frame(mapping):
        # One row per key of the mapping, labelled with that key.
        return pd.DataFrame({'value': list(mapping.values())}, index=mapping.keys())

    section = dct['EVENT'][name]
    if not isinstance(section, list):
        return as_frame(section)
    # The leading empty frame mirrors the accumulator the loop version starts with.
    return pd.concat([pd.DataFrame(), *map(as_frame, section)])
Now, you can experiment with the parser:
make_df(name="TAB_HEADER_DATA") # produces single df
make_df(name="TAB_DETAIL_DATA") # concatenates all content occurred in TAB_DETAIL_DATA sections, returns single df

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to convert XML data as a pandas data frame? - python

Related

XML into Pandas dataframe

What is the best way to parse large XML and genarate a dataframe with the data in the XML (with python or else)?

different return types for getpath() in lxml

Python XML Parser Issue

extract xml to pandas dataframe with unknown number of nodes

Categories

Resources