Parse xml and write a csv with header columns - python

I am trying to parse an xml file containing meteo data and to write some value in a csv file.
I'm not sure that this code is elegant but it works.
from qgis.PyQt.QtCore import *
import requests
import xml.etree.ElementTree as ET
# url of xml to parse
baseUrl = ('http://www.arpa.veneto.it/bollettini/meteo/h24/img08/0144.xml')
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
for stazione in tree.iter('STAZIONE'):
idstaz= stazione.find('IDSTAZ').text
for sensore in stazione.iter('SENSORE'):
id= sensore.find('ID').text
for dati in sensore.iter('DATI'):
ist = dati.get('ISTANTE')
vm = dati.find('VM').text
f = open('D:/GIS/_Temp/result.csv', 'a')
print >> f, idstaz, id, ist, vm
f.close()
I'm not sure that this code is elegant but it works.
144 300000864 201701080100 -4.2
144 300000864 201701080200 -4.5
144 300000864 201701080300 -4.8
144 300000864 201701080400 -5.5
...
but I don't know how to add the headers to the columns.

Open the file before the for loop and add header to file
from qgis.PyQt.QtCore import *
import requests
import xml.etree.ElementTree as ET
# url of xml to parse
baseUrl = ('http://www.arpa.veneto.it/bollettini/meteo/h24/img08/0144.xml')
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
f = open('D:/GIS/_Temp/result.cvs', 'a')
f.write('STAZIONE,IDSTAZ,SENSORE,ISTANTE')
for stazione in tree.iter('STAZIONE'):
idstaz= stazione.find('IDSTAZ').text
for sensore in stazione.iter('SENSORE'):
id= sensore.find('ID').text
for dati in sensore.iter('DATI'):
ist = dati.get('ISTANTE')
vm = dati.find('VM').text
print >> f, idstaz, id, ist, vm
f.close()

Related

Unable to edit powerpoint XML data points

I'm utilizing python to update data within the xml below, what im trying to do in all is update text within a powerpoint dynamically using an incoming dataframe, by doing so i pull out the xml from the pptx file and can't figure out how to change the text within the xml.
Dataframe:
Old New
0 A.1 Valuation
1 A.2 12000
2 A.3 5.23
3 A.4 Test,Complete
XMLFile: Github Link
XML Snippit:
<a:tc>
<a:txBody>
<a:bodyPr/>
<a:lstStyle/>
<a:p>
<a:pPr algn="l" fontAlgn="auto"/>
<a:r>
<a:rPr lang="en-US" sz="1800" dirty="0">
<a:effectLst/>
</a:rPr>
<a:t>A.1​</a:t>
</a:r>
<a:endParaRPr lang="en-US" sz="1800" b="0" i="0" dirty="0">
<a:solidFill>
<a:srgbClr val="000000"/>
</a:solidFill>
<a:effectLst/>
<a:latin typeface="Calibri" panose="020F0502020204030204" pitchFamily="34" charset="0"/>
</a:endParaRPr>
</a:p>
</a:txBody>
<a:tcPr/>
</a:tc>
Python Code:
import shutil
from pathlib import Path
import lxml.etree as ET
import pandas as pd
xml_file = r'\Desktop\PowerPoint XML\Test\ppt\slides\slide1.xml'
Dataframe = r'\Desktop\PowerPoint XML\Dataframe.xlsx'
df = pd.read_excel(Dataframe)
df['Old'] = df['Old'].astype(str)
df['New'] = df['New'].astype(str)
# open xml file that contains slide data
tree = ET.parse(xml_file)
treeRoot = tree.getroot()
rootString = ET.tostring(treeRoot)
decodedRootString = bytes.decode(rootString)
old = df.Old; new = df.New
# search for old value then replace with new
for elem in treeRoot.iter():
elem.text.replace(old[elem],new[elem])
FinalString=str.encode(decodedRootString)
#save the XML File
root = ET.fromstring(FinalString)
my_tree = ET.ElementTree(root)
with open(xml_file, 'wb') as f:
f.write(ET.tostring(my_tree))
Error Log:
File "\tempCodeRunnerFile.py", line 36, in <module>
elem.text.replace(old[elem],new[elem])
AttributeError: 'NoneType' object has no attribute 'replace'

How can we read JSON data from URL, convert to dataframe, and save as CSV

I'm playing around with some code to read JSON encoded data from a URL, push it into a data frame and save the results to a CSV. The code that I attempted to run is shown below. I think this is pretty close, but something is wrong, because nothing gets downloaded.
import urllib
from urllib.request import urlopen
import json
import pandas as pd
from pandas.io.json import json_normalize
all_links = ['https://www.baptisthealthsystem.com/docs/global/standard-charges/474131755_abrazomaranahospital_standardcharges.json?sfvrsn=9a27928_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621861138_abrazocavecreekhospital_standardcharges.json?sfvrsn=674fd6f_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621809851_abrazomesahospital_standardcharges.json?sfvrsn=13953222_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621811285_abrazosurprisehospital_standardcharges.json?sfvrsn=c8113dcf_2']
for item in all_links:
#print(item)
try:
length = len(item)
first_under = item.find('_') + 1
last_under = item.rfind('?') - 21
file_name = item[first_under:last_under]
print(file_name)
# store the response of URL
response = urlopen(item)
data = json.loads(response.read())
#print(type(data))
data = json.loads(item.read().decode())
df = pd.DataFrame(json_normalize(data, 'metrics'), encoding='mac_roman')
DOWNLOAD_PATH = 'C:\\Users\\ryans\\Desktop\\hospital_data\\' + file_name + '.csv'
urllib.request.urlretrieve(df,DOWNLOAD_PATH)
except Exception as e: print(e)
Any thoughts on what could be wrong here?

Python convert xml files to csv

I have a directory that contain several xml files that I would like to able to treat all of them, one by one and export them as CSV files.
Individually, It works perfectly with the script below:
import xml.etree.ElementTree as ET
import csv
tree = ET.parse('D:/scripts/xml/download_xml_1.xml')
data_out = open('D:/scripts/csv/output_1.csv', 'w',newline='', errors='ignore')
csvwriter = csv.writer(data_out)
col_names = ['Fichier','No. de document','Titre']
csvwriter.writerow(col_names)
root = tree.getroot()
for elem in root.iter(tag='Document'):
row = []
filetype = elem.find('FileType').text
row.append(filetype)
documentnumber = elem.find('DocumentNumber').text
row.append(documentnumber)
title = elem.find('Title').text
row.append(title)
csvwriter.writerow(row)
data_out.close()
But I'm going crazy to find the solution to do it, one by one and this where I am so far:
import xml.etree.ElementTree as ET
import csv
import os
for my_files in os.listdir('D:/scripts/xml/'):
tree = ET.parse(my_files)
data_out = open('D:/scripts/csv/'+ my_files[:-4] +'.csv', 'w',newline='', errors='ignore')
csvwriter = csv.writer(data_out)
col_names = ['Fichier','No. de document','Titre']
csvwriter.writerow(col_names)
root = tree.getroot()
for elem in root.iter(tag='Document'):
row = []
filetype = elem.find('FileType').text
row.append(filetype)
documentnumber = elem.find('DocumentNumber').text
row.append(documentnumber)
title = elem.find('Title').text
row.append(title)
csvwriter.writerow(row)
data_out.close()
Any help would be greatly appreciated.
Simply generalize your process in a defined method that receives a file name as input. Then, iteratively pass file names to it. Also, consider with context manager to open text connection without need to close.
import os
import csv
import xml.etree.ElementTree as ET
xml_path = r'D:\scripts\xml'
csv_path = r'D:\scripts\csv'
# DEFINED METHOD
def xml_to_csv(xml_file):
csv_file = os.path.join(csv_path, f'Output_{xml_file[:-4]}.csv')
tree = ET.parse(os.path.join(xml_path, xml_file))
with open(csv_file, 'w', newline='', errors='ignore') as data_out:
csvwriter = csv.writer(data_out)
col_names = ['Fichier', 'No. de document', 'Titre']
csvwriter.writerow(col_names)
root = tree.getroot()
for elem in root.iter(tag='Document'):
row = [elem.find('FileType').text,
elem.find('DocumentNumber').text,
elem.find('Title').text]
csvwriter.writerow(row)
# FILE ITERATION
for f in os.listdir(xml_path):
xml_to_csv(f)

NVD - JSON to CSV with Python

I am trying to download the NVD CVE. Here is my pythoncode:
import requests
import re
r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')
for filename in re.findall("nvdcve-1.0-[0-9]*\.json\.zip",r.text):
print(filename)
r_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)
with open("nvd/" + filename, 'wb') as f:
for chunk in r_file:
f.write(chunk)
Now I want to write all JSON-files ina csv-file with this format:
Name, Value, Description, ..., ...
Name, Value, Description, ..., ...
Can somebody help me?
The following should get you started, giving you two columns, ID,VendorName,DescriptionandVendorValues`:
import requests
import re
import zipfile
import io
import json
import csv
with open("nvdcve-1.0-2017.json") as f_json:
r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')
with open('output.csv', 'w', newline='') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['ID', 'VendorName', 'Description', 'VersionValues'])
for filename in re.findall("nvdcve-1.0-[0-9]*\.json\.zip", r.text):
print("Downloading {}".format(filename))
r_zip_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)
zip_file_bytes = io.BytesIO()
for chunk in r_zip_file:
zip_file_bytes.write(chunk)
zip_file = zipfile.ZipFile(zip_file_bytes)
for json_filename in zip_file.namelist():
print("Extracting {}".format(json_filename))
json_raw = zip_file.read(json_filename).decode('utf-8')
json_data = json.loads(json_raw)
for entry in json_data['CVE_Items']:
try:
vendor_name = entry['cve']['affects']['vendor']['vendor_data'][0]['vendor_name']
except IndexError:
vendor_name = "unknown"
try:
url = entry['cve']['references']['reference_data'][0]['url']
except IndexError:
url = ''
try:
vv = []
for pd in entry['cve']['affects']['vendor']['vendor_data'][0]['product']['product_data']:
for vd in pd['version']['version_data']:
vv.append(vd['version_value'])
version_values = '/'.join(vv)
except IndexError:
version_values = ''
csv_output.writerow([
entry['cve']['CVE_data_meta']['ID'],
url,
vendor_name,
entry['cve']['description']['description_data'][0]['value'],
version_values])
This downloads the zipfile into memory. It then extracts all files one at a time into memory and converts the json into a Python datas structure using json.loads(). For each entry in CVE_Items it then extracts a couple of the fields and writes them to a CSV file.
As the JSON data is highly structured, you will need to consider how you would want to represent all of the fields in a CSV file. Currently it extras two "useful" fields and stores those.
Alternatively instead of making your own CSV you could work with Pandas:
df = pd.read_json(json_raw)
df.to_csv(f_output)
Remove the csv_output lines. This though would need some extra work to decide on how it should be formatted.

Xml parsing from web response

I'm trying to get response from nominatim to geo-code few thousands of cities.
import os
import requests
import xml.etree.ElementTree as ET
txt = open('input.txt', 'r').readlines()
for line in txt:
lp, region, district, municipality, city = line.split('\t')
baseUrl = 'http://nominatim.openstreetmap.org/search/gb/'+region+'/'+district+'/'+municipality+'/'+city+'/?format=xml'
# eg. http://nominatim.openstreetmap.org/search/pl/podkarpackie/stalowowolski/Bojan%C3%B3w/Zapu%C5%9Bcie/?format=xml
resp = requests.get(baseUrl)
resp.encoding = 'UTF-8' # special diacritics
msg = resp.text
# parse response to get lat & long
tree = ET.parse(msg)
root = tree.getroot()
print tree
but the result is:
Traceback (most recent call last):
File "geo_miasta.py", line 17, in <module>
tree = ET.parse(msg)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 1182, in parse
tree.parse(source, parser)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 647, in parse
source = open(source, "rb")
IOError: [Errno 2] No such file or directory: u'<?xml version="1.0" encoding="UTF-8" ?>\n<searchresults timestamp=\'Tue, 11 Feb 14 21:13:50 +0000\' attribution=\'Data \xa9 OpenStreetMap contributors, ODbL 1.0. http://www.openstreetmap.org/copyright\' querystring=\'\u015awierczyna, Drzewica, opoczy\u0144ski, \u0142\xf3dzkie, gb\' polygon=\'false\' more_url=\'http://nominatim.openstreetmap.org/search?format=xml&exclude_place_ids=&q=%C5%9Awierczyna%2C+Drzewica%2C+opoczy%C5%84ski%2C+%C5%82%C3%B3dzkie%2C+gb\'>\n</searchresults>'
What is wrong with this?
Edit:
Thant to #rob my solution is:
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
import os
import requests
import xml.etree.ElementTree as ET
txt = open('input.txt', 'r').read().split('\n')
for line in txt:
lp, region, district, municipality, city = line.split('\t')
baseUrl = 'http://nominatim.openstreetmap.org/search/pl/'+region+'/'+district+'/'+municipality+'/'+city+'/?format=xml'
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
for place in tree.findall('place'):
location = '{:5f}\t{:5f}'.format(
float(place.get('lat')),
float(place.get('lon')))
f = open('result.txt', 'a')
f.write(location+'\t'+region+'\t'+district+'\t'+municipality+'\t'+city)
f.close()
You are using xml.etree.ElementTree.parse(), which takes a filename or a file object as an argument. But, you are not passing a file or file object in, you are passing a unicode string.
Try xml.etree.ElementTree.fromstring(text).
Like this:
tree = ET.fromstring(msg)
Here is a complete sample program:
import os
import requests
import xml.etree.ElementTree as ET
baseUrl = 'http://nominatim.openstreetmap.org/search/pl/podkarpackie/stalowowolski/Bojan%C3%B3w/Zapu%C5%9Bcie\n/?format=xml'
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
for place in tree.findall('place'):
print u'{:s}: {:+.2f}, {:+.2f}'.format(
place.get('display_name'),
float(place.get('lon')),
float(place.get('lat'))).encode('utf-8')
import os,sys,time
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import parse
tree = ET.parse('D:\Reddy\BankLoanAcctService_transactionInq.xml')
root=tree.getroot()
for TrxnEffDt in root.iter('TrxnEffDt'):
new_TrxnEffDt= str(time.strftime("%y-%m-%d"))
TrxnEffDt=str(new_TrxnEffDt)
filename2 ="D:\Reddy\BankLoanAcctService_transactionInq2.txt"
r=open(filename2,'w')
sys.stdout =r

Categories

Resources