Generate XML files based on rows in CSV

Generate XML files based on rows in CSV - python

I have a CSV and would like to generate an XML file for each row in the CSV.
Right now the script creates an XML file, but it contains only the last row of the CSV. How can I modify the script to generate an XML file for EACH row, ideally with the filename taken from the column "File / Entity Name"? See below for what I currently have. Thanks!
# CSV module
import csv
# Stuff from the XML module
from xml.etree.ElementTree import Element, SubElement, tostring, ElementTree
import xml.etree.ElementTree as etree
# Topmost XML element
# Placeholder elements: `root` is rebound for every CSV row below and
# `number` is not referenced again in this snippet.
root = Element('root')
number = Element('number')
# Open the CSV; DictReader yields one dict per row, keyed by the header names.
with open(r'U:\PROJECTS\Technical Graphics\book1.csv') as f:
for row in csv.DictReader(f):
# A new root element and tree are created for the current row.
root = Element('gmd:MD_Metadata')
tree = ElementTree(root)
# One child per column: the tag is the column name, the text is the cell value.
for k, v in row.items():
child = SubElement(root, k)
child.text = v
# NOTE(review): this second DictReader is never used afterwards in the snippet.
reader = csv.DictReader(f)
# NOTE(review): the output path is a fixed name (test.xml), so every write
# targets the same file, which matches the "only the last row" symptom described.
tree.write(open(r'U:\PROJECTS\Technical Graphics\test.xml','w'))
print tostring(root)

You set the value of Root here:
import os

# Folder that holds book1.csv and receives the generated XML files.
# A raw string literal cannot end with a backslash, so the separator is
# supplied by os.path.join instead of being typed into the literal.
base_dir = r'U:\PROJECTS\Technical Graphics'

with open(os.path.join(base_dir, 'book1.csv')) as f:
    # The DictReader is created once; each row is a dict keyed by column header.
    for row in csv.DictReader(f):
        # Build a fresh tree per row so rows never share elements.
        root = Element('gmd:MD_Metadata')
        tree = ElementTree(root)
        for k, v in row.items():
            child = SubElement(root, k)
            child.text = v
        # Look the name up by its header: row.items() has no guaranteed
        # positional order, and indexing it yields a (key, value) pair,
        # not the cell value.
        filename = row['File / Entity Name']
        # Writing inside the loop produces one XML file per CSV row.
        # Passing the path lets ElementTree open and close the file itself.
        tree.write(os.path.join(base_dir, filename + '.xml'))
        print(tostring(root))

You only want to create the csv.DictReader() class once, rather than for each iteration of your loop.
Similarly, you only want to create your root XML element once.
Finally, the order of the items returned from row.items() is arbitrary, and not reflective of the order of the fields in the file.
Try this:
# CSV module
import csv
# Stuff from the XML module
from xml.etree.ElementTree import Element, SubElement, tostring, ElementTree
import xml.etree.ElementTree as etree
# Topmost XML element
root = Element('gmd:MD_Metadata')
tree = ElementTree(root)
# Open a file
with open(r'U:\PROJECTS\Technical Graphics\book1.csv') as f:
    # One reader for the whole file; its fieldnames keep the CSV column order.
    reader = csv.DictReader(f)
    for row in reader:
        # Each CSV row becomes one <row> element under the shared root.
        xml_row = SubElement(root, "row")
        # Iterate fieldnames (not row.items()) so children follow file order.
        for k in reader.fieldnames:
            child = SubElement(xml_row, k)
            child.text = row[k]
# Pass the path rather than an open text-mode handle: ElementTree then manages
# the file itself, and the call works on Python 3 as well as Python 2.
tree.write(r'U:\PROJECTS\Technical Graphics\test.xml')
print(tostring(root))

Related

If any potential issues using xpath find/update xml tag text

I wrote a script based on several existing Stack Overflow questions, because none of them fit my problem exactly.
The user supplies an XPath expression to locate a tag in a given XML file, and the script updates that tag's text with a value the user provides.
Below is my script using Python 3 (The most difficult part for me is around the namespaces):
import xml.etree.ElementTree as ET
import sys
# user inputs and variables
filename = 'actors.xml'
xpath = 'actor/name'
value = 'test name'
temp_namespace = 'temp_namespace'

# Gather every prefix -> URI declaration in the document, then register each
# pair with ElementTree.
all_namespaces = {
    ns_prefix: ns_uri
    for _, (ns_prefix, ns_uri) in ET.iterparse(filename, events=['start-ns'])
}
for ns_prefix, ns_uri in all_namespaces.items():
    ET.register_namespace(ns_prefix, ns_uri)

# Drop the '{uri}' part of every tag and remember it in a temporary attribute,
# so the plain XPath above can match. (Approach taken from other Stack
# Overflow answers.) From Python 3.8 the TreeBuilder can keep comments too.
it = ET.iterparse(filename, parser=ET.XMLParser(target=ET.TreeBuilder(insert_comments=True)))
for _, el in it:
    uri_part, closing_brace, local_name = el.tag.partition('}')
    if not closing_brace:
        # Tag carried no namespace; nothing to strip.
        continue
    el.tag = local_name
    el.set(temp_namespace, uri_part + closing_brace)

# Apply the requested text to every element the XPath selects.
root = it.root
for match in root.findall(xpath):
    match.text = str(value)

# Remember the lines found between the XML declaration and the line that
# opens the root element, so they can be written back later.
doc_comments = []
with open(filename, 'r') as f:
    lines = f.readlines()
root_open = '<' + root.tag
for line in lines:
    if line.startswith('<?xml'):
        continue
    if line.startswith(root_open):
        break
    doc_comments.append(line)
def add_tag_namespace(el):
    """Put the saved namespace back on `el` and on all of its descendants.

    For every element that carries the module-level `temp_namespace`
    attribute, the attribute value is prepended to the tag and the attribute
    is removed. Elements without the attribute are left untouched.
    """
    # iter() yields `el` itself followed by every descendant.
    for node in el.iter():
        saved = node.attrib.pop(temp_namespace, None)
        if saved is not None:
            node.tag = saved + node.tag
# Restore the namespace-qualified tags and remove the helper attribute.
add_tag_namespace(root)

# Serialise the updated tree over the original file.
tree = ET.ElementTree(root)
tree.write(filename, encoding='unicode', xml_declaration=True)

# Nothing was found ahead of the root element, so the file is complete.
if not doc_comments:
    sys.exit()

# Re-read what was just written and splice the saved lines in directly after
# the XML declaration (first declaration line only).
with open(filename, 'r') as f:
    lines = f.readlines()
for i, line in enumerate(lines):
    if line.startswith('<?xml'):
        lines[i + 1:i + 1] = doc_comments
        break

# Write the combined content back to the file.
with open(filename, 'w') as f:
    f.writelines(lines)
actors.xml:
<?xml version="1.0"?>
<actors xmlns:fictional="http://characters.example.com"
xmlns="http://people.example.com">
<actor>
<name>John Cleese</name>
<fictional:character>Lancelot</fictional:character>
<fictional:character>Archie Leach</fictional:character>
</actor>
<actor>
<name>Eric Idle</name>
<fictional:character>Sir Robin</fictional:character>
<fictional:character>Gunther</fictional:character>
<fictional:character>Commander Clement</fictional:character>
</actor>
</actors>

Parse many XML files to one CSV file

The code below takes an XML file and parses specific elements into a CSV file. I originally had simpler code that produced slightly different output; the code below is the result of a lot of help from here.
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
# Parse the single input document and take its root element.
tree = ET.parse('thexmlfile.xml')
root = tree.getroot()
with open('output.csv', 'w', newline='') as f:
writer = csv.writer(f)
# Collect every START element below the root.
start_nodes = root.findall('.//START')
# Column order of the CSV; each output row is built by looking these keys up.
headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
writer.writerow(headers)
for sn in start_nodes:
# defaultdict(str) makes any header without a value come out as ''.
row = defaultdict(str)
# Attributes of the START element are copied in under their own names.
for k,v in sn.attrib.items():
row[k] = v
# For each of the tags below, the last match found under this START wins.
for rn in sn.findall('.//Rational'):
row['rational'] = rn.text
for qu in sn.findall('.//Qualify'):
row['qualify'] = qu.text
for ds in sn.findall('.//Description'):
row['description_txt'] = ds.text
row['description_num'] = ds.attrib['num']
# all other tags except set data must be parsed before this.
for st in sn.findall('.//SetData'):
# SetData attributes are stored under a 'set_data_' prefix to match the headers.
for k,v in st.attrib.items():
row['set_data_'+ str(k)] = v
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# the row is written once per SetData or once per START; confirm against the original.
row_data = [row[i] for i in headers]
writer.writerow(row_data)
row = defaultdict(str)
I'm trying to extend this code so that it goes through a folder containing many XML files and parses them all into one single CSV file. Simply put: instead of parsing one XML file, do the same for multiple XML files and write the results to one CSV file.
What I would normally do is use os.listdir(). The code would look something like this:
# Folder that holds the XML files to process.
directory = 'C:/Users/docs/FolderwithXMLs'
for filename in os.listdir(directory):
# Only entries whose name ends in ".xml" are handled.
if filename.endswith(".xml"):
#Something here
# NOTE(review): `df` is not defined anywhere in this sketch; presumably a
# placeholder for the parsed result. The fixed output path means each call
# would target the same output.csv.
df.to_csv("./output.csv")
continue
else:
continue
I have tried different ways to implement this into the code from above without success until now. Considering that this process should also be fast.

Try:
import csv
from collections import defaultdict
from pathlib import Path
from xml.etree import ElementTree as ET

directory = 'C:/Users/docs/FolderwithXMLs'
headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']

# The CSV is opened once and the header written once; every XML file then
# appends its rows to the same writer.
with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    # '**/*.xml' also matches files in sub-folders. glob() gives no ordering
    # guarantee, so sort to make the row order reproducible between runs.
    for xml_file in sorted(Path(directory).glob('**/*.xml')):
        root = ET.parse(str(xml_file)).getroot()
        for sn in root.findall('.//START'):
            # The per-START work must sit inside this loop (the indentation
            # was wrong here in the question's version).
            row = defaultdict(str)
            row.update(sn.attrib)
            # Rest of the code here.
Hope that helps.

How to write to csv with multiple for in statements

I have a Python 3.7.3 script that reads an XML, parses what I need and is supposed to export the results to CSV. I had to go deeper in the XML tree using a for in loop for one of the fields, which throws off how the other for in statements append to csv.
When running the below, my output file does not list the different V-ID's (refer to the third for child in root... statement), however all the other fields are correct. The V-ID's display correctly when i remove the last for in statement and move the firstFile.write statement 2 tabs to the left, but then I don't have the status, so I know the problem is in the last statement. BTW, if I move the firstFile.write statement all the way to the left, it only returns one row in the csv, but there should be 5.
Is there a way to create a list from the output and then combine them all, or perhaps move the firstFile.write statement two tabs to the left and append the last for in statement to a specific column (essentially breaking up the firstFile.write statement)? Or do you have any other suggestions?
import os
import sys
import glob
import xml.etree.ElementTree as ET
# Open the report in append mode and write the header cells one at a time.
# NOTE(review): with mode "a" these header cells are appended again on every run.
firstFile = open("myfile.csv", "a")
firstFile.write("V-ID,")
firstFile.write("HostName,")
firstFile.write("Status,")
firstFile.write("Comments,")
firstFile.write("Finding Details,")
firstFile.write("STIG Name,")
# Directory that contains this script.
basePath = os.path.dirname(os.path.realpath(__file__))
# NOTE(review): the second argument is already an absolute Windows path;
# os.path.join discards earlier components when a later one is absolute,
# so basePath presumably has no effect here.
xmlFile = os.path.join(basePath, "C:\\Users\\myUserName\\Desktop\\Scripts\\Python\\XMLtest.xml")
tree = ET.parse(xmlFile)
root = tree.getroot()
# STIG name: text of the <title> element(s) directly under the root, kept in d.
for child in root.findall('{http://checklists.nist.gov/xccdf/1.2}title'):
d = child.text
# Host name: text of <target> elements one level below the root's children, kept in b.
for child in root:
for children in child.findall('{http://checklists.nist.gov/xccdf/1.2}target'):
b = children.text
# V-ID: taken from the string form of each Group's attributes, using the
# seventh '_'-separated piece with its last two characters removed.
# NOTE(review): `a` is reassigned for every Group; unless the write below is
# nested in this loop, only the last value is still available when writing,
# which matches the symptom described in the question.
for child in root.findall('{http://checklists.nist.gov/xccdf/1.2}Group'):
x = (str(child.attrib))
x = (x.split('_')[6])
a = x[:-2]
# Status: 'Completed' when a <result> text contains 'pass', otherwise 'Ongoing'.
for child in root:
for children in child:
for childrens in children.findall('{http://checklists.nist.gov/xccdf/1.2}result'):
x = childrens.text
if ('pass' in x):
c = 'Completed'
else:
c = 'Ongoing'
# One CSV line: V-ID, host, status, two empty cells (Comments, Finding Details), STIG name.
firstFile.write("\n" + a + ',' + b + ',' + c + ',' + ',' + ',' + d)
firstFile.close()

Finally — it took about a week to figure this out. I write the output to a CSV, then read it back into one list per column, filter out the empty entries, and write it out again. Below is how I did it.
import os
import sys
import glob
import csv
import xml.etree.ElementTree as ET
# Namespace prefix shared by every XCCDF 1.2 tag looked up below.
XCCDF = '{http://checklists.nist.gov/xccdf/1.2}'

path = 'C:\\Users\\JT\\Desktop\\Scripts\\Python\\xccdf\\'
# Staging file: raw values are appended here, one short row per V-ID and one
# six-column row per result. The same path is used for writing and for
# reading back, so the two steps cannot end up looking at different files.
stagingPath = 'C:\\Users\\JT\\Desktop\\Scripts\\Python\\myfile.csv'
# Final report, rebuilt from the staging file on every run.
outputPath = 'C:\\Users\\JT\\Desktop\\Scripts\\Python\\myfile1.csv'

# Step 1: append the raw values of every checklist to the staging file.
# The `with` block closes (and therefore flushes) the file before step 2
# reads it back. csv.writer quotes values that contain commas, which plain
# string concatenation did not.
with open(stagingPath, 'a') as firstFile:
    stagingWriter = csv.writer(firstFile, delimiter=',', lineterminator='\n')
    for fileName in glob.glob(os.path.join(path, '*.xml')):
        tree = ET.parse(fileName)
        root = tree.getroot()
        # Stig Title
        for child in root.findall(XCCDF + 'title'):
            d = child.text
        # hostName
        for child in root:
            for children in child.findall(XCCDF + 'target'):
                b = children.text
        # V-ID: seventh '_'-separated piece of the attribute dict's string
        # form, minus its last two characters.
        for child in root.findall(XCCDF + 'Group'):
            x = str(child.attrib)
            x = x.split('_')[6]
            a = x[:-2]
            stagingWriter.writerow([a])
        # Status: column 0 is left empty so these rows never count as V-IDs.
        for child in root:
            for children in child:
                for childrens in children.findall(XCCDF + 'result'):
                    x = childrens.text
                    stagingWriter.writerow(['', b, x, '', '', d])

# Step 2: read the staging file back into one list per column. Rows have
# either one or six cells, so each column is taken only where it exists
# (this replaces the bare `except: pass` blocks and also tolerates blank lines).
vIDs = []
hostNames = []
status = []
stigTitles = []
columns = ((0, vIDs), (1, hostNames), (2, status), (5, stigTitles))
with open(stagingPath, 'r', newline='') as csvFile:
    for line in csv.reader(csvFile, delimiter=','):
        for index, bucket in columns:
            if len(line) > index:
                bucket.append(line[index])

# Step 3: drop the empty cells from every column and write the columns side
# by side. zip() stops at the shortest column.
vIDMod = list(filter(None, vIDs))
hostNameMod = list(filter(None, hostNames))
statusMod = list(filter(None, status))
stigTitlesMod = list(filter(None, stigTitles))
with open(outputPath, 'w', newline='') as csvFile1:
    csvWriter = csv.writer(csvFile1, delimiter=',')
    csvWriter.writerows(zip(vIDMod, hostNameMod, statusMod, stigTitlesMod))

Convert CSV document to XML

I know the question is redundant but I tried all the Python code that I found and modified for my file but they did not work. I need to find a way to convert my file myData.csv in to a XML format file which can be read by a navigator.
I just started to learn Python this month so I'm a beginner. This is my code:
#! usr/bin/python
# -*- coding: utf-8 -*-
import csv, sys, os
from lxml import etree
from xml.sax.saxutils import escape

csvFile = 'myData.csv'  # name of the CSV file to convert

print("<data>")
# The `with` block closes the CSV file once every record has been read.
with open(csvFile) as source:
    # Semicolon-separated input, read without any quote processing.
    reader = csv.reader(source, delimiter=';', quoting=csv.QUOTE_NONE)
    for record in reader:
        if reader.line_num == 1:
            # The first line supplies the element names for all later records.
            header = record
            continue
        innerXml = ""
        dontShow = False
        # Name of the wrapping element; decided by columns 1 and 3 below.
        # (Renamed from `type` so the builtin is not shadowed.)
        tagName = ""
        for i, field in enumerate(record):
            name = header[i].lower()
            # escape() turns &, < and > into entities so cell text cannot
            # break the surrounding markup.
            innerXml += "<%s>" % name + escape(field) + "</%s>" % name
            if i == 1 and field == "0":
                tagName = "Next"
            elif tagName == "" and i == 3 and field == "0":
                tagName = "Next"
            elif tagName == "" and i == 3 and field != "0":
                tagName = "film"
            # An "X" in column 1 hides the whole record.
            if i == 1 and field == "X":
                dontShow = True
        if not dontShow:
            # NOTE(review): tagName stays empty for records with fewer than
            # four fields whose column 1 is not "0"; decide how those should
            # be wrapped.
            xml = "<%s>" % tagName
            xml += innerXml
            xml += "</%s>" % tagName
            print(xml)
print("</data>")

Consider building your XML with dedicated DOM objects and not a concatenation of strings which you can do with the lxml module. Using methods such as Element(), SubElement(), etc. you can iteratively build XML tree from reading CSV data:
import csv
import lxml.etree as ET
# Element names for the columns, in the order they appear in the CSV.
headers = ['Titre', 'Realisateur', 'Date_Debut_Evenement', 'Date_Fin_Evenement', 'Cadre',
           'Lieu', 'Adresse', 'Arrondissement', 'Adresse_complète', 'Geo_Coordinates']

# INITIALIZING XML FILE
root = ET.Element('root')

# READING CSV FILE AND BUILD TREE
with open('myData.csv', newline='') as f:
    next(f)  # SKIP HEADER
    csvreader = csv.reader(f)
    for row in csvreader:
        if not row:
            # csv.reader returns [] for a blank line; there is nothing to emit.
            continue
        data = ET.SubElement(root, "data")
        # zip() pairs each element name with its cell and stops at the
        # shorter of the two, so a short row no longer raises IndexError.
        for header, cell in zip(headers, row):
            ET.SubElement(data, header).text = str(cell)

# SAVE XML TO FILE
tree_out = ET.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")

# OUTPUTTING XML CONTENT TO FILE
# tostring() returned bytes (an encoding was given), hence binary mode.
with open('Output.xml', 'wb') as f:
    f.write(tree_out)
Output
<?xml version='1.0' encoding='UTF-8'?>
<root>
<data>
<Titre>1</Titre>
<Realisateur>BUS PALLADIUM</Realisateur>
<Date_Debut_Evenement>CHRISTOPHER THOMPSON</Date_Debut_Evenement>
<Date_Fin_Evenement>21 mai 2009</Date_Fin_Evenement>
<Cadre>21 mai 2009</Cadre>
<Lieu>EXTERIEUR</Lieu>
<Adresse>PLACE</Adresse>
<Arrondissement>PIGALLE</Arrondissement>
<Adresse_complète>75018</Adresse_complète>
<Geo_Coordinates>PLACE PIGALLE 75018 Paris France</Geo_Coordinates>
</data>
<data>
<Titre>2</Titre>
<Realisateur>LES INVITES DE MON PERE</Realisateur>
<Date_Debut_Evenement>ANNE LE NY</Date_Debut_Evenement>
<Date_Fin_Evenement>20 mai 2009</Date_Fin_Evenement>
<Cadre>20 mai 2009</Cadre>
<Lieu>DOMAINE PUBLIC</Lieu>
<Adresse>SQUARE</Adresse>
<Arrondissement>DU CLIGNANCOURT</Arrondissement>
<Adresse_complète>75018</Adresse_complète>
<Geo_Coordinates>SQUARE DU CLIGNANCOURT 75018 Paris France</Geo_Coordinates>
</data>
<data>
<Titre>3</Titre>
<Realisateur>DEMAIN, A L'AUBE</Realisateur>
<Date_Debut_Evenement>GAEL CABOUAT</Date_Debut_Evenement>
<Date_Fin_Evenement>17 avril 2009</Date_Fin_Evenement>
<Cadre>17 avril 2009</Cadre>
<Lieu>EXTERIEUR</Lieu>
<Adresse>RUE</Adresse>
<Arrondissement>QUINCAMPOIX</Arrondissement>
<Adresse_complète>75004</Adresse_complète>
<Geo_Coordinates>RUE QUINCAMPOIX 75004 Paris France</Geo_Coordinates>
</data>
...

(posted as an answer so I can show a code block)
There are a lot of picky details when writing XML. In Python, you should probably use some version of ElementTree to help with that. One good tutorial is Creating XML Documents. Quoting from there:
from xml.etree.ElementTree import Element, SubElement, Comment, tostring


def _add_child(parent, tag, text, tail=None):
    """Append a `tag` element with `text` to `parent`; set `tail` when given."""
    node = SubElement(parent, tag)
    node.text = text
    if tail is not None:
        node.tail = tail
    return node


# Document root, starting with a comment node.
top = Element('top')
comment = Comment('Generated for PyMOTW')
top.append(comment)

# A plain child, a child followed by tail text, and a child whose text
# contains a character that has to be written as an entity.
child = _add_child(top, 'child', 'This child contains text.')
child_with_tail = _add_child(
    top, 'child_with_tail', 'This child has regular text.', tail='And "tail" text.'
)
child_with_entity_ref = _add_child(top, 'child_with_entity_ref', 'This & that')

print(tostring(top))
If you use this as an example of how to create a tree of XML elements, you should be able to translate your code into the XML structure you need.

Importing pandas and saving file name:
import pandas as pd
csvFile = 'myData.csv'
The following will read CSV into a pandas data frame, then convert to XML.
# Read the CSV named by `csvFile` (defined above) into a DataFrame.
df = pd.read_csv(csvFile)
# Called without a target, to_xml() returns the XML document as a string.
df_xml = df.to_xml()
The code below creates a new file named "csv2xml.xml" and saves the XML data to it:
# The `with` block closes the file even if the write fails. UTF-8 is given
# explicitly so the bytes on disk match the encoding that to_xml() declares
# by default in the XML header.
with open("csv2xml.xml", "w", encoding="utf-8") as f:
    f.write(df_xml)

Problems with parsing xml

I have some code that parses an XML file and saves it as a CSV. I can do this in two ways: by manually downloading the XML file and then parsing it, or by taking the XML feed directly and parsing it with ET.fromstring. When I go the direct route I get data errors, which appears to be an integrity issue. I am trying to include the XML download in the code, but I am not quite sure of the best way to approach this.
import xml.etree.ElementTree as ET
import csv
import urllib
# Download the station feed; `data` holds the body of the response.
url = 'http://www.capitalbikeshare.com/data/stations/bikeStations.xml'
connection = urllib.urlopen(url)
data = connection.read()
#I need code here!!!
# NOTE(review): `data` is not used below; the tree is parsed from a local
# file named bikeStations.xml instead of from the download.
tree = ET.parse('bikeStations.xml')
root = tree.getroot()
#for child in root:
#print child.tag, child.attrib
# One [name, bikes, docks, time] list is collected per <station> element.
locations = []
for station in root.findall('station'):
name = station.find('name').text
bikes = station.find('nbBikes').text
docks = station.find('nbEmptyDocks').text
time = station.find('latestUpdateTime').text
sublist = [name, bikes, docks, time]
locations.append(sublist)
#print 'Station:', name, 'has', bikes, 'bikes and' ,docks, 'docks'
#print locations
# statuslog.csv is opened with 'wb', so it holds only the rows from this run.
s = open('statuslog.csv', 'wb')
w = csv.writer(s)
w.writerows(locations)
s.close()
# filelog.csv is opened with 'ab', so the same rows are appended to earlier content.
f = open('filelog.csv', 'ab')
w = csv.writer(f)
w.writerows(locations)
f.close()

What you need is:
# Parse the downloaded XML text directly; fromstring() returns the root element.
root = ET.fromstring(data)
and omit the line of: tree = ET.parse('bikeStations.xml')
As the response from connection.read() returns String, you can directly read the XML string by using fromstring method, you can read more from HERE.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Generate XML files based on rows in CSV - python

Related

If any potential issues using xpath find/update xml tag text

Parse many XML files to one CSV file

How to write to csv with multiple for in statements

Convert CSV document to XML

Problems with parsing xml

Categories

Resources