Parse xml w/ xsd to CSV with Python?

Parse xml w/ xsd to CSV with Python? - python

I am trying to parse a very large XML file which I downloaded from OSHA's website and convert it into a CSV so I can use it in a SQLite database along with some other spreadsheets. I would just use an online converter, but the osha file is apparently too big for all of them.
I wrote a script in Python which looks like this:
import csv
import xml.etree.cElementTree as ET
tree = ET.parse('data.xml')
root = tree.getroot()
xml_data_to_csv =open('Out.csv', 'w')
list_head=[]
Csv_writer=csv.writer(xml_data_to_csv)
count=0
for element in root.findall('data'):
List_nodes =[]
if count== 0:
inspection_number = element.find('inspection_number').tag
list_head.append(inspection_number)
establishment_name = element.find('establishment_name').tag
list_head.append(establishment_name)
city = element.find('city')
list_head.append(city)
state = element.find('state')
list_head.append(state)
zip_code = element.find('zip_code')
list_head.append(zip_code)
sic_code = element.find('sic_code')
list_head.append(sic_code)
naics_code = element.find('naics_code')
list_head.append(naics_code)
sampling_number = element.find('sampling_number')
list_head.append(sampling_number)
office_id = element.find('office_id')
list_head.append(office_id)
date_sampled = element.find('date_sampled')
list_head.append(date_sampled)
date_reported = element.find('date_reported')
list_head.append(date_reported)
eight_hour_twa_calc = element.find('eight_hour_twa_calc')
list_head.append(eight_hour_twa_calc)
instrument_type = element.find('instrument_type')
list_head.append(instrument_type)
lab_number = element.find('lab_number')
list_head.append(lab_number)
field_number = element.find('field_number')
list_head.append(field_number)
sample_type = element.find('sample_type')
list_head.append(sample_type)
blank_used = element.find('blank_used')
list_head.append(blank_used)
time_sampled = element.find('time_sampled')
list_head.append(time_sampled)
air_volume_sampled = element.find('air_volume_sampled')
list_head.append(air_volume_sampled)
sample_weight = element.find('sample_weight')
list_head.append(sample_weight)
imis_substance_code = element.find('imis_substance_code')
list_head.append(imis_substance_code)
substance = element.find('substance')
list_head.append(substance)
sample_result = element.find('sample_result')
list_head.append(sample_result)
unit_of_measurement = element.find('unit_of_measurement')
list_head.append(unit_of_measurement)
qualifier = element.find('qualifier')
list_head.append(qualifier)
Csv_writer.writerow(list_head)
count = +1
inspection_number = element.find('inspection_number').text
List_nodes.append(inspection_number)
establishment_name = element.find('establishment_name').text
List_nodes.append(establishment_name)
city = element.find('city').text
List_nodes.append(city)
state = element.find('state').text
List_nodes.append(state)
zip_code = element.find('zip_code').text
List_nodes.append(zip_code)
sic_code = element.find('sic_code').text
List_nodes.append(sic_code)
naics_code = element.find('naics_code').text
List_nodes.append(naics_code)
sampling_number = element.find('sampling_number').text
List_nodes.append(sampling_number)
office_id = element.find('office_id').text
List_nodes.append(office_id)
date_sampled = element.find('date_sampled').text
List_nodes.append(date_sampled)
date_reported = element.find('date_reported').text
List_nodes.append(date_reported)
eight_hour_twa_calc = element.find('eight_hour_twa_calc').text
List_nodes.append(eight_hour_twa_calc)
instrument_type = element.find('instrument_type').text
List_nodes.append(instrument_type)
lab_number = element.find('lab_number').text
List_nodes.append(lab_number)
field_number = element.find('field_number').text
List_nodes.append(field_number)
sample_type = element.find('sample_type').text
List_nodes.append(sample_type)
blank_used = element.find('blank_used').text
List_nodes.append()
time_sampled = element.find('time_sampled').text
List_nodes.append(time_sampled)
air_volume_sampled = element.find('air_volume_sampled').text
List_nodes.append(air_volume_sampled)
sample_weight = element.find('sample_weight').text
List_nodes.append(sample_weight)
imis_substance_code = element.find('imis_substance_code').text
List_nodes.append(imis_substance_code)
substance = element.find('substance').text
List_nodes.append(substance)
sample_result = element.find('sample_result').text
List_nodes.append(sample_result)
unit_of_measurement = element.find('unit_of_measurement').text
List_nodes.append(unit_of_measurement)
qualifier= element.find('qualifier').text
List_nodes.append(qualifier)
Csv_writer.writerow(List_nodes)
xml_data_to_csv.close()
But when I run the code I get a CSV with nothing in it. I suspect this may have something to do with the XSD file associated with the XML, but I'm not totally sure.
Does anyone know what the issue is here?

The code below is a 'compact' version of your code.
It assumes that the XML structure looks like in the script variable xml. (Based on https://www.osha.gov/opengov/sample_data_2011.zip)
The main difference bwtween this sample code and yours is that I define the fields that I want to collect once (see FIELDS) and I use this definition across the script.
import xml.etree.ElementTree as ET
FIELDS = ['lab_number', 'instrument_type'] # TODO add more fields
xml = '''<main xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="health_sample_data.xsd">
<DATA_RECORD>
<inspection_number>316180165</inspection_number>
<establishment_name>PROFESSIONAL ENGINEERING SERVICES, LLC.</establishment_name>
<city>EUFAULA</city>
<state>AL</state>
<zip_code>36027</zip_code>
<sic_code>1799</sic_code>
<naics_code>238990</naics_code>
<sampling_number>434866166</sampling_number>
<office_id>418600</office_id>
<date_sampled>2011-12-30</date_sampled>
<date_reported>2011-12-30</date_reported>
<eight_hour_twa_calc>N</eight_hour_twa_calc>
<instrument_type>TBD</instrument_type>
<lab_number>L13645</lab_number>
<field_number>S1</field_number>
<sample_type>B</sample_type>
<blank_used>N</blank_used>
<time_sampled></time_sampled>
<air_volume_sampled></air_volume_sampled>
<sample_weight></sample_weight>
<imis_substance_code>S777</imis_substance_code>
<substance>Soil</substance>
<sample_result>0</sample_result>
<unit_of_measurement>AAAAA</unit_of_measurement>
<qualifier></qualifier>
</DATA_RECORD>
<DATA_RECORD>
<inspection_number>315516757</inspection_number>
<establishment_name>MARGUERITE CONCRETE CO.</establishment_name>
<city>WORCESTER</city>
<state>MA</state>
<zip_code>1608</zip_code>
<sic_code>1771</sic_code>
<naics_code>238110</naics_code>
<sampling_number>423259902</sampling_number>
<office_id>112600</office_id>
<date_sampled>2011-12-30</date_sampled>
<date_reported>2011-12-30</date_reported>
<eight_hour_twa_calc>N</eight_hour_twa_calc>
<instrument_type>GRAV</instrument_type>
<lab_number>L13355</lab_number>
<field_number>9831B</field_number>
<sample_type>P</sample_type>
<blank_used>N</blank_used>
<time_sampled>184</time_sampled>
<air_volume_sampled>340.4</air_volume_sampled>
<sample_weight>.06</sample_weight>
<imis_substance_code>9135</imis_substance_code>
<substance>Particulates not otherwise regulated (Total Dust)</substance>
<sample_result>0.176</sample_result>
<unit_of_measurement>M</unit_of_measurement>
<qualifier></qualifier>
</DATA_RECORD></main>'''
root = ET.fromstring(xml)
records = root.findall('.//DATA_RECORD')
with open('out.csv', 'w') as out:
out.write(','.join(FIELDS) + '\n')
for record in records:
values = [record.find(f).text for f in FIELDS]
out.write(','.join(values) + '\n')
out.csv
lab_number,instrument_type
L13645,TBD
L13355,GRAV

Related

'int' object is not subscriptable - Python using Pandas

Im working with nested JSON data with Pandas, but i have a problem once i extract the dataframe of the nested data.
The data looks like:
[{"export_id":"COL-EXP-1894","origin_office":"EXAMPLE","destination_office":"","incoterms":"","shipment_date":"","export_date":"2023-01-01","origin_port":"Buenaventura","destination_port":"New York/New Jersey","bl_number":null,"shipping_line":null,"shipping_mode":null,"vessel_name":null,"voyage_number":null,"reservation_number":null,"container_number":null,"seal_number":null,"eta":null,"etd":null,"export_status":"in_progress","ico_list":\[\]}\]
And reading like that all is good, but some data have ico_list like:
[{"export_id":"COL-EXP-1894","origin_office":"EXAMPLE","destination_office":"","incoterms":"","shipment_date":"","export_date":"2023-01-01","origin_port":"Buenaventura","destination_port":"New York/New Jersey","bl_number":null,"shipping_line":null,"shipping_mode":null,"vessel_name":null,"voyage_number":null,"reservation_number":null,"container_number":null,"seal_number":null,"eta":null,"etd":null,"export_status":"in_progress","ico_list":[{"ico_id":"03-0178-436-23","contract_id":"CI-1046","customer":null,"origin_office":"example","destination_office":"example","incoterm":"CIF","quality":"ML","mark":"example","packaging_type":"Nitrogen-Flushed Vac-Packed Boxes - 35KG","packaging_capacity":35.0,"units":1,"quantity":35.0,"certification":null}]}]
And not just one like the example, can be more, so i implemented this:
if response.status_code == 200:
data_str = response.text
try:
atlas_api_data = json.loads(data_str)
df_atlas = pd.json_normalize(atlas_api_data)
#print(df_atlas)
except:
print('ErrorOccured While Parsing JSON ATLAS API TO Dataframe')
df_atlas2 = pd.json_normalize(df_atlas['ico_list'].loc[95])
for i, row in df_atlas.iterrows():
export_id = row['export_id']
origin_office = row['origin_office']
destination_office = row['destination_office']
export_date = row['export_date']
origin_port = row['origin_port']
destination_port = row['destination_port']
bl_number = row['bl_number']
shipping_line = row['shipping_line']
shipping_mode = row['shipping_mode']
vessel_name = row['vessel_name']
voyage_number = row['voyage_number']
reservation_number = row['reservation_number']
container_number = row['container_number']
seal_number = row['seal_number']
export_status = row['export_status']
values = [export_id,origin_office,destination_office,export_date,origin_port,destination_port,
bl_number,shipping_line,shipping_mode,vessel_name,voyage_number,reservation_number,container_number,
seal_number,export_status]
data_list.append(values)
df_atlas2 = pd.json_normalize(df_atlas['ico_list'].loc[i])
if df_atlas2.empty:
print('Empty DF')
else:
for row_ico, j in df_atlas2.iterrows():
ico_id = row_ico['ico_id']
contract_id = row_ico['contract_id']
customer = row_ico['customer']
incoterm = row_ico['incoterm']
quality = row_ico['quality']
mark = row_ico['mark']
packaging_type = row_ico['packaging_type']
packaging_capacity = row_ico['packaging_capacity']
units = row_ico['units']
quantity = row_ico['quantity']
certification = row_ico['certification']
ico_values = [export_id,ico_id,contract_id,customer,incoterm,quality,mark,packaging_type,packaging_capacity,units,quantity,certification]
data_ico_list.append(ico_values)
In this way i extract only the data that i need, and for the first level worked, but when i go to the second iterrows() it says
TypeError Traceback (most recent call last)
Cell In [4], line 43
41 else:
42 for row_ico, j in df_atlas2.iterrows():
---> 43 ico_id = row_ico['ico_id']
44 contract_id = row_ico['contract_id']
45 customer = row_ico['customer']
TypeError: 'int' object is not subscriptable
When printing the df_atlas2 it looks normal, like this:
variable: df_atlas2 before goes into iterrrows()
I tried using df_atlas2['ico_id'].astype(str) with all the columns and ico_id = str(row_ico['ico_id']) but still getting the message
If you know how to solve this, hundred thanks!

What is the best way to parse large XML and genarate a dataframe with the data in the XML (with python or else)?

I try to make a table (or csv, I'm using pandas dataframe) from the information of an XML file.
The file is here (.zip is 14 MB, XML is ~370MB), https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip . It has package information of different languages - node.js, python, java etc. aka, CPE 2.3 list by the US government org NVD.
this is how it looks like in the first 30 rows:
<cpe-list xmlns:config="http://scap.nist.gov/schema/configuration/0.1" xmlns="http://cpe.mitre.org/dictionary/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.3" xmlns:cpe-23="http://scap.nist.gov/schema/cpe-extension/2.3" xmlns:ns6="http://scap.nist.gov/schema/scap-core/0.1" xmlns:meta="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" xsi:schemaLocation="http://scap.nist.gov/schema/cpe-extension/2.3 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary-extension_2.3.xsd http://cpe.mitre.org/dictionary/2.0 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary_2.3.xsd http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2 https://scap.nist.gov/schema/cpe/2.1/cpe-dictionary-metadata_0.2.xsd http://scap.nist.gov/schema/scap-core/0.3 https://scap.nist.gov/schema/nvd/scap-core_0.3.xsd http://scap.nist.gov/schema/configuration/0.1 https://scap.nist.gov/schema/nvd/configuration_0.1.xsd http://scap.nist.gov/schema/scap-core/0.1 https://scap.nist.gov/schema/nvd/scap-core_0.1.xsd">
<generator>
<product_name>National Vulnerability Database (NVD)</product_name>
<product_version>4.9</product_version>
<schema_version>2.3</schema_version>
<timestamp>2022-03-17T03:51:01.909Z</timestamp>
</generator>
<cpe-item name="cpe:/a:%240.99_kindle_books_project:%240.99_kindle_books:6::~~~android~~">
<title xml:lang="en-US">$0.99 Kindle Books project $0.99 Kindle Books (aka com.kindle.books.for99) for android 6.0</title>
<references>
<reference href="https://play.google.com/store/apps/details?id=com.kindle.books.for99">Product information</reference>
<reference href="https://docs.google.com/spreadsheets/d/1t5GXwjw82SyunALVJb2w0zi3FoLRIkfGPc7AMjRF0r4/edit?pli=1#gid=1053404143">Government Advisory</reference>
</references>
<cpe-23:cpe23-item name="cpe:2.3:a:\$0.99_kindle_books_project:\$0.99_kindle_books:6:*:*:*:*:android:*:*"/>
</cpe-item>
The tree structure of the XML file is quite simple, the root is 'cpe-list', the child element is 'cpe-item', and the grandchild elements are 'title', 'references' and 'cpe23-item'.
From 'title', I want the text in the element;
From 'cpe23-item', I want the attribute 'name';
From 'references', I want the attributes 'href' from its great-grandchildren, 'reference'.
The dataframe should look like this:
| cpe23_name | title_text | ref1 | ref2 | ref3 | ref_other
0 | 'cpe23name 1'| 'this is a python pkg'| 'url1'| 'url2'| NaN | NaN
1 | 'cpe23name 2'| 'this is a java pkg' | 'url1'| 'url2'| NaN | NaN
...
my code is here，finished in ~100sec：
import xml.etree.ElementTree as et
xtree = et.parse("official-cpe-dictionary_v2.3.xml")
xroot = xtree.getroot()
import time
start_time = time.time()
df_cols = ["cpe", "text", "vendor", "product", "version", "changelog", "advisory", 'others']
title = '{http://cpe.mitre.org/dictionary/2.0}title'
ref = '{http://cpe.mitre.org/dictionary/2.0}references'
cpe_item = '{http://scap.nist.gov/schema/cpe-extension/2.3}cpe23-item'
p_cpe = None
p_text = None
p_vend = None
p_prod = None
p_vers = None
p_chan = None
p_advi = None
p_othe = None
rows = []
i = 0
while i < len(xroot):
for elm in xroot[i]:
if elm.tag == title:
p_text = elm.text
#assign p_text
elif elm.tag == ref:
for nn in elm:
s = nn.text.lower()
#check the lower text in refs
if 'version' in s:
p_vers = nn.attrib.get('href')
#assign p_vers
elif 'advisor' in s:
p_advi = nn.attrib.get('href')
#assign p_advi
elif 'product' in s:
p_prod = nn.attrib.get('href')
#assign p_prod
elif 'vendor' in s:
p_vend = nn.attrib.get('href')
#assign p_vend
elif 'change' in s:
p_chan = nn.attrib.get('href')
#assign p_vend
else:
p_othe = nn.attrib.get('href')
elif elm.tag == cpe_item:
p_cpe = elm.attrib.get("name")
#assign p_cpe
else:
print(elm.tag)
row = [p_cpe, p_text, p_vend, p_prod, p_vers, p_chan, p_advi, p_othe]
rows.append(row)
p_cpe = None
p_text = None
p_vend = None
p_prod = None
p_vers = None
p_chan = None
p_advi = None
p_othe = None
print(len(rows)) #this shows how far I got during the running time
i+=1
out_df1 = pd.DataFrame(rows, columns = df_cols)# move this part outside the loop by removing the indent
print("---853k rows take %s seconds ---" % (time.time() - start_time))
updated: the faster way is to move the 2nd last row out side the loop. Since 'rows' already get info in each loop, there is no need to make a new dataframe every time.
the running time now is 136.0491042137146 seconds. yay!

Since your XML is fairly flat, consider the recently added IO module, pandas.read_xml introduced in v1.3. Given XML uses a default namespace, to reference elements in xpath use namespaces argument:
url = "https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip"
df = pd.read_xml(
url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}
)
If you do not have the default parser, lxml, installed, use the etree parser:
df = pd.read_xml(
url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}, parser="etree"
)

parse xml to pandas data frame in python

I am trying to read the XML file and convert it to pandas. However it returns empty data
This is the sample of xml structure:
<Instance ID="1">
<MetaInfo StudentID ="DTSU040" TaskID="LP03_PR09.bLK.sh" DataSource="DeepTutorSummer2014"/>
<ProblemDescription>A car windshield collides with a mosquito, squashing it.</ProblemDescription>
<Question>How does this work tion?</Question>
<Answer>tthis is my best </Answer>
<Annotation Label="correct(0)|correct_but_incomplete(1)|contradictory(0)|incorrect(0)">
<AdditionalAnnotation ContextRequired="0" ExtraInfoInAnswer="0"/>
<Comments Watch="1"> The student forgot to tell the opposite force. Opposite means opposite direction, which is important here. However, one can argue that the opposite is implied. See the reference answers.</Comments>
</Annotation>
<ReferenceAnswers>
1: Since the windshield exerts a force on the mosquito, which we can call action, the mosquito exerts an equal and opposite force on the windshield, called the reaction.
</ReferenceAnswers>
</Instance>
I have tried this code, however it's not working on my side. It returns empty dataframe.
import pandas as pd
import xml.etree.ElementTree as et
xtree = et.parse("grade_data.xml")
xroot = xtree.getroot()
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", 'Question', 'Answer',
'ContextRequired', 'ExtraInfoInAnswer', 'Comments', 'Watch', 'ReferenceAnswers']
rows = []
for node in xroot:
s_name = node.attrib.get("ID")
s_student = node.find("StudentID")
s_task = node.find("TaskID")
s_source = node.find("DataSource")
s_desc = node.find("ProblemDescription")
s_question = node.find("Question")
s_ans = node.find("Answer")
s_label = node.find("Label")
s_contextrequired = node.find("ContextRequired")
s_extraInfoinAnswer = node.find("ExtraInfoInAnswer")
s_comments = node.find("Comments")
s_watch = node.find("Watch")
s_referenceAnswers = node.find("ReferenceAnswers")
rows.append({"ID": s_name,"StudentID":s_student, "TaskID": s_task,
"DataSource": s_source, "ProblemDescription": s_desc ,
"Question": s_question , "Answer": s_ans ,"Label": s_label,
"s_contextrequired": s_contextrequired , "ExtraInfoInAnswer": s_extraInfoinAnswer ,
"Comments": s_comments , "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers,
})
out_df = pd.DataFrame(rows, columns = df_cols)

The problem in your solution was that the "element data extraction" was not done properly. The xml you mentioned in the question is nested in several layers. And that is why we need to recursively read and extract the data. The following solution should give you what you need in this case. Although I would encourage you to look at this article and the python documentation for more clarity.
Method: 1
import numpy as np
import pandas as pd
#import os
import xml.etree.ElementTree as ET
def xml2df(xml_source, df_cols, source_is_file = False, show_progress=True):
"""Parse the input XML source and store the result in a pandas
DataFrame with the given columns.
For xml_source = xml_file, Set: source_is_file = True
For xml_source = xml_string, Set: source_is_file = False
<element attribute_key1=attribute_value1, attribute_key2=attribute_value2>
<child1>Child 1 Text</child1>
<child2>Child 2 Text</child2>
<child3>Child 3 Text</child3>
</element>
Note that for an xml structure as shown above, the attribute information of
element tag can be accessed by list(element). Any text associated with <element> tag can be accessed
as element.text and the name of the tag itself can be accessed with
element.tag.
"""
if source_is_file:
xtree = ET.parse(xml_source) # xml_source = xml_file
xroot = xtree.getroot()
else:
xroot = ET.fromstring(xml_source) # xml_source = xml_string
consolidator_dict = dict()
default_instance_dict = {label: None for label in df_cols}
def get_children_info(children, instance_dict):
# We avoid using element.getchildren() as it is deprecated.
# Instead use list(element) to get a list of attributes.
for child in children:
#print(child)
#print(child.tag)
#print(child.items())
#print(child.getchildren()) # deprecated method
#print(list(child))
if len(list(child))>0:
instance_dict = get_children_info(list(child),
instance_dict)
if len(list(child.keys()))>0:
items = child.items()
instance_dict.update({key: value for (key, value) in items})
#print(child.keys())
instance_dict.update({child.tag: child.text})
return instance_dict
# Loop over all instances
for instance in list(xroot):
instance_dict = default_instance_dict.copy()
ikey, ivalue = instance.items()[0] # The first attribute is "ID"
instance_dict.update({ikey: ivalue})
if show_progress:
print('{}: {}={}'.format(instance.tag, ikey, ivalue))
# Loop inside every instance
instance_dict = get_children_info(list(instance),
instance_dict)
#consolidator_dict.update({ivalue: instance_dict.copy()})
consolidator_dict[ivalue] = instance_dict.copy()
df = pd.DataFrame(consolidator_dict).T
df = df[df_cols]
return df
Run the following to generate the desired output.
xml_source = r'grade_data.xml'
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
"ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", 'ReferenceAnswers']
df = xml2df(xml_source, df_cols, source_is_file = True)
df
Method: 2
Given you have the xml_string, you could convert xml >> dict >> dataframe. run the following to get the desired output.
Note: You will need to install xmltodict to use Method-2. This method is inspired by the solution suggested by #martin-blech at How to convert XML to JSON in Python? [duplicate]
. Kudos to #martin-blech for making it.
pip install -U xmltodict
Solution
def read_recursively(x, instance_dict):
#print(x)
txt = ''
for key in x.keys():
k = key.replace("#","")
if k in df_cols:
if isinstance(x.get(key), dict):
instance_dict, txt = read_recursively(x.get(key), instance_dict)
#else:
instance_dict.update({k: x.get(key)})
#print('{}: {}'.format(k, x.get(key)))
else:
#print('else: {}: {}'.format(k, x.get(key)))
# dig deeper if value is another dict
if isinstance(x.get(key), dict):
instance_dict, txt = read_recursively(x.get(key), instance_dict)
# add simple text associated with element
if k=='#text':
txt = x.get(key)
# update text to corresponding parent element
if (k!='#text') and (txt!=''):
instance_dict.update({k: txt})
return (instance_dict, txt)
You will need the function read_recursively() given above. Now run the following.
import xmltodict, json
o = xmltodict.parse(xml_string) # INPUT: XML_STRING
#print(json.dumps(o)) # uncomment to see xml to json converted string
consolidated_dict = dict()
oi = o['Instances']['Instance']
for x in oi:
instance_dict = dict()
instance_dict, _ = read_recursively(x, instance_dict)
consolidated_dict.update({x.get("#ID"): instance_dict.copy()})
df = pd.DataFrame(consolidated_dict).T
df = df[df_cols]
df

Several issues:
Calling .find on the loop variable, node, expects a child node to exist: current_node.find('child_of_current_node'). However, since all the nodes are the children of root they do not maintain their own children, so no loop is required;
Not checking NoneType that can result from missing nodes with find() and prevents retrieving .tag or .text or other attributes;
Not retrieving node content with .text, otherwise the <Element... object is returned;
Consider this adjustment using the ternary condition expression a if condition else b to ensure variable has a value regardless:
rows = []
s_name = xroot.attrib.get("ID")
s_student = xroot.find("StudentID").text if xroot.find("StudentID") is not None else None
s_task = xroot.find("TaskID").text if xroot.find("TaskID") is not None else None
s_source = xroot.find("DataSource").text if xroot.find("DataSource") is not None else None
s_desc = xroot.find("ProblemDescription").text if xroot.find("ProblemDescription") is not None else None
s_question = xroot.find("Question").text if xroot.find("Question") is not None else None
s_ans = xroot.find("Answer").text if xroot.find("Answer") is not None else None
s_label = xroot.find("Label").text if xroot.find("Label") is not None else None
s_contextrequired = xroot.find("ContextRequired").text if xroot.find("ContextRequired") is not None else None
s_extraInfoinAnswer = xroot.find("ExtraInfoInAnswer").text if xroot.find("ExtraInfoInAnswer") is not None else None
s_comments = xroot.find("Comments").text if xroot.find("Comments") is not None else None
s_watch = xroot.find("Watch").text if xroot.find("Watch") is not None else None
s_referenceAnswers = xroot.find("ReferenceAnswers").text if xroot.find("ReferenceAnswers") is not None else None
rows.append({"ID": s_name,"StudentID":s_student, "TaskID": s_task,
"DataSource": s_source, "ProblemDescription": s_desc ,
"Question": s_question , "Answer": s_ans ,"Label": s_label,
"s_contextrequired": s_contextrequired , "ExtraInfoInAnswer": s_extraInfoinAnswer ,
"Comments": s_comments , "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers
})
out_df = pd.DataFrame(rows, columns = df_cols)
Alternatively, run a more dynamic version assigning to an inner dictionary using the iterator variable:
rows = []
for node in xroot:
inner = {}
inner[node.tag] = node.text
rows.append(inner)
out_df = pd.DataFrame(rows, columns = df_cols)
Or list/dict comprehension:
rows = [{node.tag: node.text} for node in xroot]
out_df = pd.DataFrame(rows, columns = df_cols)

Trying to avoid creating numerous variables in python

I am new to python and I have a lot of variables I will be using in this script. These variables are being used to grab data from each column in an uploaded file. I have added variables for each object type and I have about 12 more object types to add. Isn't there a better way I can do this? I have the file it's grabbing data from here:
Action Object Solution ID hostgroup_name alias
Add Host Group ISD-CR ISD-CR_database ISD-CR Database
Add Service ISD-CR ISD-CR_database
Update Service Group ISD-CR ISD-CR Database
Delete Service ISD-CR ISD-CR_database
Here is the script I have so far.
from pynag import Model
from pynag.Parsers import config
def addObject():
# Add hostgroup object
hg = Model.Hostgroup()
hg.set_filename('/etc/nagios/objects/solution1/{0}.cfg'.format(target_hostgroup_name))
# Adding all attributes to allow any to be added if needed
hg.hostgroup_name = target_hostgroup_name
hg.alias = target_alias
hg.members = target_members
hg.hostgroup_members = target_hostgroup_members
hg.notes = target_notes
hg.notes_url = target_notes_url
hg.action_url = target_action_url
# Save
hg.save()
print "hostgroup added"
# Add service object
s = Model.Service()
s.set_filename('/etc/nagios/objects/solution1/{0}.cfg'.format(target_hostgroup_name))
# Adding all attributes to allow any to be added if needed
s.host_name = target_host_name
s.hostgroup_name = target_hostgroup_name
s.service_description = target_service_description
s.display_name = target_display_name
s.servicegroups = target_servicegroups
s.is_volatile = target_is_volatile
s.check_command = target_check_command
s.initial_state = target_initial_state
s.max_check_attempts = target_max_check_attempts
s.check_interval = target_check_interval
s.retry_interval = target_retry_interval
s.active_checks_enabled = target_active_checks_enabled
s.passive_checks_enabled = target_passive_checks_enabled
s.check_period = target_check_period
s.obsess_over_service = target_obsess_over_service
s.check_freshness = target_check_freshness
s.freshness_threshold = target_freshness_threshold
s.event_handler = target_event_handler
s.event_handler_enabled = target_event_handler_enabled
s.low_flap_threshold = target_low_flap_threshold
s.high_flap_threshold = target_high_flap_threshold
s.flap_detection_enabled = target_flap_detection_enabled
s.flap_detection_options = target_flap_detection_options
s.process_perf_data = target_process_perf_data
s.retain_status_information = target_retain_status_information
s.retain_nonstatus_information = target_retain_nonstatus_information
s.notification_interval = target_notification_interval
s.first_notification_delay = target_first_notification_delay
s.notification_period = target_notification_period
s.notification_options = target_notification_options
s.notification_enabled = target_notifications_enabled
s.contacts = target_contacts
s.contact_groups = target_contact_groups
s.stalking_options = target_stalking_options
s.notes = target_notes
s.notes_url = target_notes_url
s.action_url = target_action_url
s.icon_image = target_icon_image
s.icon_image_alt = target_icon_image_alt
# Save
s.save()
print "service added"
# Add servicegroup object
sg = Model.Servicegroup()
sg.set_filename('/etc/nagios/objects/solution1/{0}.cfg'.format(target_hostgroup_name))
# Adding all attributes to allow any to be added if needed
sg.servicegroup_name = target_servicegroup_name
sg.alias = target_alias
sg.members = target_members
sg.servicegroup_members = target_servicegroup_members
sg.notes = target_notes
sg.notes_url = target_notes_url
sg.action_url = '/etc/nagios/objects/solution1/{0}.cfg'.format(target_hostgroup_name)
# Save
sg.save()
print "service group added"
try:
current_file = csv.reader(open(input_file, "rb"), delimiter='\t')
except:
logging.error('No such file or directory. Please try again')
else:
for line in current_file:
for row in current_file:
target_hostgroup_name = row[3]
target_alias = row[4]
target_members = row[5]
target_hostgroup_members = row[6]
target_notes = row[7]
target_notes_url = row[8]
target_action_url = row[9]
target_host_name = row[10]
target_service_description = row[11]
target_display_name = row[12]
target_servicegroups = row[13]
target_is_volatile = row[14]
target_check_command = row[15]
target_initial_state = row[16]
target_max_check_attempts = row[17]
target_check_interval = row[18]
target_retry_interval = row[19]
target_active_checks_enabled = row[20]
target_passive_checks_enabled = row[21]
target_check_period = row[22]
target_obsess_over_service = row[23]
target_check_freshness = row[24]
target_freshness_threshold = row[25]
target_event_handler = row[26]
target_event_handler_enabled = row[27]
target_low_flap_threshold = row[28]
target_high_flap_threshold = row[29]
target_flap_detection_enabled = row[30]
target_flap_detection_options = row[31]
target_process_perf_data = row[32]
target_retain_status_information = row[33]
target_retain_nonstatus_information = row[34]
target_notification_interval = row[35]
target_first_notification_delay = row[36]
target_notification_period = row[37]
target_notification_options = row[38]
target_notifications_enabled = row[39]
target_contacts = row[40]
target_contact_groups = row[41]
target_stalking_options = row[42]
target_icon_image = row[43]
target_icon_image_alt = row[44]
target_servicegroup_name = row[45]
target_servicegroup_members = row[46]

If the values are in the same order every time, you could consider populating a list that you then could loop over, instead of doing it one by one.
For the "target" portion of your script, you could nest another loop for range(3, 46) as well, and pass the index to your list instead of manually for every number from 3 to 46.

Why do you do this?
for line in current_file:
for row in current_file:
If the first row is a header row and you're skipping it on purpose, you can use a DictReader instead.
It doesn't look like you'll be able to do much to clean this up, but you could factor out each "section" into its own function:
def save_hostgroup(name, alias, members, hostgroup_members, notes, notes_url, action_url):
hg = Model.Hostgroup()
hg.set_filename('/etc/nagios/objects/solution1/{0}.cfg'.format(target_hostgroup_name))
# Adding all attributes to allow any to be added if needed
hg.hostgroup_name = target_hostgroup_name
hg.alias = target_alias
hg.members = target_members
hg.hostgroup_members = target_hostgroup_members
hg.notes = target_notes
hg.notes_url = target_notes_url
hg.action_url = target_action_url
hg.save()

Behind the scenes all the member names of an object are stored in a dict. You can access this dict with vars(obj) or obj.__dict__. You can then use the update method of the dict to add a set of names to your object.
eg.
class SomeClass:
def __str__(self):
return "SomeClass({})".format(
", ".join(
"{}={!r}".format(key, value)
for key, value in self.__dict__.items()
)
)
__repr__ = __str__
target_names = ['var_a', 'var_b', 'var_c']
target_values = [1, 2, 3]
target = dict(zip(target_names, target_values))
assert target == {'var_a': 1, 'var_b': 2, 'var_c': 3}
s = SomeClass()
vars(s).update(target)
assert hasattr(s, 'var_a')
assert s.var_a == 1
print(s) # prints SomeClass(var_c=3, var_a=1, var_b=2)

What Should Be In My Return?

I am using Python to parse an XML response from a SOAP web-service. The Customer returns about 40 values as you can see below. I would like to know if there is a way to make it so I only have to type one thing into my return statement and get all of the values returned? I tried to use for customer in doc.findall('.//Customer').itervalues() and that did not work as I believe that call is for dictionaries. Same results and reasoning behind .iteritems.
doc = ET.fromstring(response_xml)
for customer in doc.findall('.//Customer'):
customer_number = customer.findtext('CustomerNumber')
customer_first_name = customer.findtext('FirstName')
customer_last_name = customer.findtext('LastName')
customer_middle_name = customer.findtext('MiddleName')
customer_salutation = customer.findtext('Salutation')
customer_gender = customer.findtext('Gender')
customer_language = customer.findtext('Language')
customer_address1 = customer.findtext('Address1')
customer_address2 = customer.findtext('Address2')
customer_address3 = customer.findtext('Address3')
customer_city = customer.findtext('City')
customer_county = customer.findtext('County')
customer_state_code = customer.findtext('StateCode')
customer_zip_code = customer.findtext('ZipCode')
customer_phone_number = customer.findtext('PhoneNumber')
customer_business_phone = customer.findtext('BusinessPhone')
customer_business_ext = customer.findtext('BusinessExt')
customer_fax_number = customer.findtext('FaxNumber')
customer_birth_date = customer.findtext('BirthDate')
customer_drivers_license = customer.findtext('DriversLicense')
customer_contact = customer.findtext('Contact')
customer_preferred_contact = customer.findtext('PreferredContact')
customer_mail_code = customer.findtext('MailCode')
customer_tax_exempt_Number = customer.findtext('TaxExmptNumber')
customer_assigned_salesperson = customer.findtext('AssignedSalesperson')
customer_type = customer.findtext('CustomerType')
customer_preferred_phone = customer.findtext('PreferredPhone')
customer_cell_phone = customer.findtext('CellPhone')
customer_page_phone = customer.findtext('PagePhone')
customer_other_phone = customer.findtext('OtherPhone')
customer_other_phone_desc = customer.findtext('OtherPhoneDesc')
customer_email1 = customer.findtext('Email1')
customer_email2 = customer.findtext('Email2')
customer_optional_field = customer.findtext('OptionalField')
customer_allow_contact_postal = customer.findtext('AllowContactByPostal')
customer_allow_contact_phone = customer.findtext('AllowContactByPhone')
customer_allow_contact_email = customer.findtext('AllowContactByEmail')
customer_business_phone_ext = customer.findtext('BusinessPhoneExtension')
customer_internatinol_bus_phone = customer.findtext('InternationalBusinessPhone')
customer_international_cell = customer.findtext('InternationalCellPhone')
customer_external_x_reference_key = customer.findtext('ExternalCrossReferenceKey')
customer_international_fax = customer.findtext('InternationalFaxNumber')
customer_international_other_phone = customer.findtext('InternationalOtherPhone')
customer_international_home_phone = customer.findtext('InternationalHomePhone')
customer_preferred_name = customer.findtext('CustomerPreferredName')
customer_international_pager = customer.findtext('InternationalPagerPhone')
customer_preferred_lang = customer.findtext('PreferredLanguage')
customer_last_change_date = customer.findtext('LastChangeDate')
customer_vehicles = customer.findtext('Vehicles')
customer_ccid = customer.findtext('CCID')
customer_cccd = customer.findtext('CCCD')
webservice.close()
return

I would write that as a generator function yielding dicts where the key matches the findtext argument, e.g.:
fields = ['CustomerNumber', 'FirstName', 'LastName',
# ...
]
for customer in doc.findall('.//Customer'):
yield dict((f, customer.findtext(f)) for f in fields)

You either want to return a list of dicts:
customers = []
for customer in doc.findall('.//Customer'):
customer_dict = {}
customer_dict['number'] = customer.findtext('CustomerNumber')
customer_dict['first_name'] = customer.findtext('FirstName')
customer_dict['last_name'] = customer.findtext('LastName')
# ad nauseum
customers.append(customer_dict)
webservice.close()
return customers
Or you make a Customer class that handles this, and you return a list of customer instances.

I would use a dictionary of dictionaries:
doc = ET.fromstring(response_xml)
customers = {}
cust_dict = {}
for customer in doc.findall('.//Customer'):
cust_dict['customer_number'] = customer.findtext('CustomerNumber')
cust_dict['customer_first_name'] = customer.findtext('FirstName')
cust_dict['customer_last_name'] = customer.findtext('LastName')
snip snip...
customers[customer_number] = cust_dict # or whatever property you want to use to identify each customer, I'm assuming customer_number is some sort of id number
webservice.close()
return customers
That is if you don't have a class you can use to create a Customer object.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Parse xml w/ xsd to CSV with Python? - python

Related

'int' object is not subscriptable - Python using Pandas

What is the best way to parse large XML and genarate a dataframe with the data in the XML (with python or else)?

parse xml to pandas data frame in python

Trying to avoid creating numerous variables in python

What Should Be In My Return?

Categories

Resources