Python: convert XML files to CSV

I have a directory that contains several XML files, and I would like to process all of them, one by one, and export them as CSV files.
Individually, it works perfectly with the script below:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('D:/scripts/xml/download_xml_1.xml')
data_out = open('D:/scripts/csv/output_1.csv', 'w', newline='', errors='ignore')
csvwriter = csv.writer(data_out)
col_names = ['Fichier', 'No. de document', 'Titre']
csvwriter.writerow(col_names)
root = tree.getroot()
for elem in root.iter(tag='Document'):
    row = []
    filetype = elem.find('FileType').text
    row.append(filetype)
    documentnumber = elem.find('DocumentNumber').text
    row.append(documentnumber)
    title = elem.find('Title').text
    row.append(title)
    csvwriter.writerow(row)
data_out.close()
But I'm going crazy trying to find a solution to do it one by one, and this is where I am so far:
import xml.etree.ElementTree as ET
import csv
import os

for my_files in os.listdir('D:/scripts/xml/'):
    tree = ET.parse(my_files)
    data_out = open('D:/scripts/csv/' + my_files[:-4] + '.csv', 'w', newline='', errors='ignore')
    csvwriter = csv.writer(data_out)
    col_names = ['Fichier', 'No. de document', 'Titre']
    csvwriter.writerow(col_names)
    root = tree.getroot()
    for elem in root.iter(tag='Document'):
        row = []
        filetype = elem.find('FileType').text
        row.append(filetype)
        documentnumber = elem.find('DocumentNumber').text
        row.append(documentnumber)
        title = elem.find('Title').text
        row.append(title)
        csvwriter.writerow(row)
    data_out.close()
Any help would be greatly appreciated.

Simply generalize your process into a defined method that receives a file name as input. Then, iteratively pass file names to it. Also, consider using a with context manager to open the output file, so there is no need to close it explicitly.
import os
import csv
import xml.etree.ElementTree as ET

xml_path = r'D:\scripts\xml'
csv_path = r'D:\scripts\csv'

# DEFINED METHOD
def xml_to_csv(xml_file):
    csv_file = os.path.join(csv_path, f'Output_{xml_file[:-4]}.csv')
    tree = ET.parse(os.path.join(xml_path, xml_file))
    with open(csv_file, 'w', newline='', errors='ignore') as data_out:
        csvwriter = csv.writer(data_out)
        col_names = ['Fichier', 'No. de document', 'Titre']
        csvwriter.writerow(col_names)
        root = tree.getroot()
        for elem in root.iter(tag='Document'):
            row = [elem.find('FileType').text,
                   elem.find('DocumentNumber').text,
                   elem.find('Title').text]
            csvwriter.writerow(row)

# FILE ITERATION
for f in os.listdir(xml_path):
    xml_to_csv(f)
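One caveat not spelled out above: os.listdir() returns every entry in the directory, so any non-XML file present would crash ET.parse(). A minimal guard, assuming only names ending in .xml should be processed:

# FILE ITERATION, skipping anything that is not an .xml file
for f in os.listdir(xml_path):
    if f.lower().endswith('.xml'):
        xml_to_csv(f)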

Related

Iterating file paths in python dataframe

I have a data frame, called filedataframe, that holds all the file paths. My code works for pulling what I want from an individual XML file, but it is currently set up for a single file. How do I make it iterate through the data frame filedataframe to use each file path? I want to add rootId, file_Name, unique_ID, and employee_badge with their respective file paths.
import re
import pathlib
import os
import pandas as pd
import xml.etree.ElementTree as ET

filesdataframe = []
# example path would be Defined Contributions,
xmlfile = (r'INVESTING.cdm')
# We are parsing it.
tree = ET.parse(xmlfile)
# We then get the root.
root = tree.getroot()
for elm in root.findall('.//{object}IntraModelReport'):
    print(elm.text)
for Model in root.findall('.//{object}IntraModelReport'):
    rootId = elm.attrib
    file_Name = Model.find("{attribute}Code").text
    unique_ID = Model.find("{attribute}ObjectID").text
    employee_badge = Model.find("{attribute}Creator").text
    print(rootId, file_Name, unique_ID, employee_badge)
Try this.
import re
import pathlib
import os
import pandas as pd
import xml.etree.ElementTree as ET
from typing import Dict, List

def process_single_xmlfile(xmlfile: str, verbose: bool = False) -> Dict:
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    for elm in root.findall('.//{object}IntraModelReport'):
        print(elm.text)
    package: Dict = {'xmlfile': xmlfile, 'models': []}
    for Model in root.findall('.//{object}IntraModelReport'):
        rootId = elm.attrib
        file_Name = Model.find("{attribute}Code").text
        unique_ID = Model.find("{attribute}ObjectID").text
        employee_badge = Model.find("{attribute}Creator").text
        if verbose:
            print(rootId, file_Name, unique_ID, employee_badge)
        package['models'].append(dict(
            rootId=rootId,
            file_Name=file_Name,
            unique_ID=unique_ID,
            employee_badge=employee_badge,
        ))
    return package

#### LOOP OVER
# all the results will be stored in this list
extracts: List[Dict] = []
# xmlfiles is a list of xml filenames: you need to provide this
# you can replace "xmlfiles" with your "filedataframe".
for xmlfile in xmlfiles:
    # set verbose=True to enable printing
    extracts.append(process_single_xmlfile(xmlfile, verbose=False))
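Since the question stores its paths in a data frame, the same function can be driven from it and the results flattened back into one. A sketch, assuming filedataframe is a pandas DataFrame whose paths live in a column named 'path' (the column name is an assumption):

# iterate the paths held in the dataframe instead of a plain list
for xmlfile in filedataframe['path']:
    extracts.append(process_single_xmlfile(xmlfile, verbose=False))

# flatten the per-file records into a single dataframe, one row per model
rows = [dict(model, xmlfile=package['xmlfile'])
        for package in extracts
        for model in package['models']]
df = pd.DataFrame(rows)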

How to process CSV to HDFS using Apache NiFi?

Hello guys, hope you're doing well!
I have some CSV files that I want to put into HDFS, and if a file already exists, its content should be appended to the existing content. I tried a script in Python, but with no results.
import os
import pandas as pd
from os import path
import sys, json
import csv
from csv import writer, reader

data = json.load(sys.stdin)
technologies = ['KPI_2G_NPO', 'GPRS']
old_path = data["old.path"]
filename = data["filename"]
old_path = old_path.replace("C:\\Users\\12\\Desktop\\APACHE~1\\NIFI-1~1.1\\", "")
old_path = old_path.replace("/", "")
old_path_list = old_path.split('\\')

def append_list_as_row(file_name, list_of_elem):
    with open(file_name, 'a+', newline='') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(list_of_elem)

df = pd.read_csv(data["new.path"] + data["filename"])
columns = df.columns.values.tolist()
for tech in technologies:
    if (tech in filename and old_path_list[0] in filename):
        if path.exists("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv"):
            header_saved = True
            with open(data["new.path"] + data["filename"]) as file2:
                header = next(file2)
                header = next(file2)
                if header_saved:
                    for line in file2:
                        append_list_as_row("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv", list(line.split(",")))
            os.remove(data["new.path"] + data["filename"])
        else:
            df.to_csv("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv")
            os.remove(data["new.path"] + data["filename"])
And here's a picture of my NiFi pipeline.
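No answer was posted for this one, but one likely culprit stands out: Python's built-in open() and os.path.exists() only see the local filesystem, so the "hdfs://..." strings above are treated as ordinary (nonexistent) local paths and nothing ever reaches HDFS. A minimal sketch of create-or-append through WebHDFS with the hdfs (hdfscli) package; the endpoint URL, user, and paths are assumptions:

from hdfs import InsecureClient  # pip install hdfs

# WebHDFS endpoint and user are assumptions; adjust to your cluster
client = InsecureClient('http://quickstart.cloudera:50070', user='cloudera')
target = '/user/cloudera/data/KPI_2G_NPO_example.csv'

with open('local_input.csv') as src:
    rows = src.read()

# append when the file already exists, otherwise create it
if client.status(target, strict=False):
    client.write(target, data=rows, encoding='utf-8', append=True)
else:
    client.write(target, data=rows, encoding='utf-8')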

How to parse multiple xml files and pass through attributes into csv?

I need to parse a directory of XML files into one large CSV file. I need certain attributes under the element 'Param' (the attributes are 'Name' and 'PNum'). There is another XML file in the directory called Content.xml from which I can get the names of all the other XML files and set them as the FileName. The issue is that I cannot figure out how to get these attributes in each XML file, as each XML file has a different organisation and some don't seem to have these attributes in the first place.
I have written code that works for one of the XML files in the directory and outputs a CSV file with all the relevant information.
import xml.etree.ElementTree as ET
import csv
import os

FileName = '------.xml'
tree = ET.parse(FileName)
root = tree.getroot()[4]
csv_out = open('CsvOut', 'w')
csvwriter = csv.writer(csv_out)
count = 0
for child in root:
    generation = []
    parameters = []
    if count == 0:
        csv_head = ['Generation', 'Parameter Name', 'Parameter Number']
        csvwriter.writerow(csv_head)
        count = count + 1
    gen = FileName[:-4]
    generation.append(gen)
    parameters.append(generation)
    name = child.get('Name')
    parameters.append(name)
    num = child.get('PNum')
    parameters.append(num)
    csvwriter.writerow(parameters)
csv_out.close()
It's rather simple and you can do it in two steps:
First, enumerate all the XML files in the directory.
Then, run your existing code over those files.
import xml.etree.ElementTree as ET
import csv
import os
from glob import glob

# create csv writer
csv_out = open('CsvOut', 'w')
csvwriter = csv.writer(csv_out)

# write the header
csv_head = ['Generation', 'Parameter Name', 'Parameter Number']
csvwriter.writerow(csv_head)

# iterate over the xml files in the current directory
for FileName in glob("*.xml"):
    tree = ET.parse(FileName)
    root = tree.getroot()[4]
    for child in root:
        generation = []
        parameters = []
        gen = FileName[:-4]
        generation.append(gen)
        parameters.append(generation)
        name = child.get('Name')
        parameters.append(name)
        num = child.get('PNum')
        parameters.append(num)
        csvwriter.writerow(parameters)

# after iterating, close the csv file
csv_out.close()
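The question also notes that some files don't have these attributes at all; Element.get() returns None in that case, so the rows can be guarded. A small sketch (skipping fully empty rows, and writing a flat row instead of the nested generation list, are both assumptions):

# inside the 'for child in root:' loop, replacing the row-building lines
name = child.get('Name')
num = child.get('PNum')
if name is None and num is None:
    continue  # this child carries neither attribute; skip it
csvwriter.writerow([FileName[:-4], name, num])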

Only portions of data are being written to a csv file, the rest is missing

I am parsing through many XML files and putting certain information into a CSV file. Because my XML files are named "1.xml", "2.xml", etc., I am using a for loop to cycle through the different XML file titles. However, depending on the range that I use in my for loop, my CSV file contains different data. For example, when my for loop range is 1:200, my CSV file includes info from XML files 1 to 199. However, when I change the range to 1:300, my CSV file only contains info for XML files 217 to 249. The info actually stored in my CSV file changes based on the range I put in my for loop. Has anyone else had this error, and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path

# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)

data_head = []
data = []
missingfiles = 0
missfiles = []
MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)
data_head.append(MediaId)
# Family = member.find('Family').tag
data_head.append(Content)
data_head.append(ClassId)
data_head.append(Family)
# Species = member.find('Species').tag
data_head.append(Species)
# Genus = member.find('Genus').tag
data_head.append(Genus)
csvwriter.writerow(data_head)

for i in range(1, 190):
    #print (i)
    data = []
    inputfilename = str(i) + ".xml"
    my_file = Path(inputfilename)
    if my_file.is_file():
        data_labels = open('DataLabels.csv', 'w', newline='')
        tree = ET.parse(inputfilename)
        root = tree.getroot()
        MediaId = root[2].text
        Content = root[4].text
        ClassId = root[5].text
        Family = root[6].text
        Species = root[7].text
        Genus = root[8].text
        #print (vote)
        #count = 0
        #for Image in root.find('MediaId'):
        #print (child.tag, child.attrib)
        #name = child.find('MediaId').text
        # print (Image.find ('MediaId').text)
        ##csvwriter.writerow (data_head)
        #data = []
        #if count == 0:
        # print ("count is zero i'm in loop")
        # MediaId = member.find('MediaId').tag
        # count = count + 1
        #else:
        #MediaId = root.findall('MediaId').text
        data.append(MediaId)
        data.append(Content)
        data.append(ClassId)
        #Family = member.find('Family').text
        data.append(Family)
        #Species = member.find('Species').text
        data.append(Species)
        #Genus = member.find('Genus').text
        data.append(Genus)
        csvwriter.writerow(data)
        data_labels.close()
        #print (data)
    else:
        missingfiles = missingfiles + 1
        missfiles = []
        missfiles.append(inputfilename)
        csvwriter2.writerow(missfiles)

print("missing", missingfiles, "files")
data_labels.close()
missing_files.close()
print("done")
Open the csv in append mode, else you are just overwriting the same file.
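A minimal sketch of that fix: the re-open of DataLabels.csv in 'w' mode inside the loop is what truncates the file on every hit, which is why only a late slice of files survives.

# inside the loop, replace the 'w' re-open with append mode...
data_labels = open('DataLabels.csv', 'a', newline='')
# ...or simply delete the re-open and keep writing through the csvwriter
# created before the loop, closing data_labels once after the loop ends.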
I think you need to divide your script into small, readable functions.
First, you can create a function to parse a XML file:
import xml.etree.ElementTree as ET
def parse_xml_file(xml_path):
""" Parse an XML file and return the data. """
# type: (str) -> list
tree = ET.parse(xml_path)
root = tree.getroot()
return [
root[2].text,
root[4].text,
root[5].text,
root[6].text,
root[7].text,
root[8].text]
This function parses an XML file and returns one record containing a list of values.
Then, you can create a function to iterate over a list of XML files (the existing files) and populate the CSV file:
import csv
import io
import os

def populate_data_labels(xml_path_list, work_dir="."):
    # header order matches the list returned by parse_xml_file()
    header = ["Media Id", "Content", "ClassId", "Family", "Species", "Genus"]
    with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w', newline='') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in xml_path_list:
            writer.writerow(parse_xml_file(xml_path))
This function uses parse_xml_file() to extract each record.
You can create a function to log the missing files. You can use CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
    header = ["Filename"]
    with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w', newline='') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in missing_files:
            writer.writerow([os.path.basename(xml_path)])
Finally, you can write a function which searches for the XML files and calls the previous functions:
def parse_work_dir(work_dir="."):
    all_files = [os.path.join(work_dir, "{0}.xml".format(idx))
                 for idx in range(1, 190)]
    existing_files = (path for path in all_files if os.path.exists(path))
    populate_data_labels(existing_files, work_dir)
    missing_files = (path for path in all_files if not os.path.exists(path))
    populate_missing_files(missing_files, work_dir)
Usage:
parse_work_dir("/path/to/your/working/dir")

Read CSV from within Zip File

I have a directory of zip files (approximately 10,000 small files); within each is a CSV file I am trying to read and split into a number of different CSV files.
I managed to write the code to split the CSV files from a directory of CSVs, shown below. It reads the first attribute of the CSV and, depending on what it is, writes the row to the relevant CSV.
import csv
import os
import sys
import re
import glob

reader = csv.reader(open("C:/Projects/test.csv", "rb"), delimiter=',', quotechar='"')
write10 = csv.writer(open('ouput10.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
write15 = csv.writer(open('ouput15.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

headings10 = ["RECORD_IDENTIFIER","CUSTODIAN_NAME","LOCAL_CUSTODIAN_NAME","PROCESS_DATE","VOLUME_NUMBER","ENTRY_DATE","TIME_STAMP","VERSION","FILE_TYPE"]
write10.writerow(headings10)

headings15 = ["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","USRN","STREET_DESCRIPTION","LOCALITY_NAME","TOWN_NAME","ADMINSTRATIVE_AREA","LANGUAGE"]
write15.writerow(headings15)

for row in reader:
    type = row[0]
    if "10" in type:
        write10.writerow(row)
    elif "15" in type:
        write15.writerow(row)
So I am now trying to read the zip files directly rather than wasting time extracting them first.
This is what I have so far, after following as many tutorials as I could find:
import glob
import os
import csv
import zipfile
import StringIO

for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv = '.'.join([dataFile, 'csv'])
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv))
    reader = csv.reader(data)
    for row in reader:
        print row
However, an error gets thrown:
AttributeError: 'str' object has no attribute 'reader'
Hopefully someone can show me how to change my working CSV-reading code so that it reads from the zip files.
Much appreciated
Tim
Simple fix. You're shadowing the csv module with your local csv variable. Just rename that variable:
import glob
import os
import csv
import zipfile
import StringIO

for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv_file = '.'.join([dataFile, 'csv'])  # all fixed
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv_file))  # don't forget this line!
    reader = csv.reader(data)
    for row in reader:
        print row
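A side note beyond the original thread: the code above is Python 2 (StringIO, print statement). On Python 3, ZipFile.open() plus io.TextIOWrapper avoids reading the whole member into memory first. A minimal sketch; the archive and member names are illustrative:

import csv
import io
import zipfile

with zipfile.ZipFile('C:/Projects/abase/example.zip') as zfile:
    # ZipFile.open() yields a binary stream; TextIOWrapper decodes it for csv
    with zfile.open('example.csv') as raw:
        reader = csv.reader(io.TextIOWrapper(raw, encoding='utf-8', newline=''))
        for row in reader:
            print(row)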
