How to parse multiple xml files and pass through attributes into csv? - python

I need to parse a directory of XML files into one large CSV file. I need certain attributes under the element 'Param' (the attributes are 'Name' and 'PNum'). There is another XML file in the directory called Content.xml, from which I can get the names of all the other XML files and set them as the FileName. The issue is that I cannot figure out how to get these attributes in each XML file, as each XML file has a different organisation and some don't seem to have these attributes in the first place.
I have written code that works for one of the XML files in the directory and outputs a CSV file with all the relevant information.
import xml.etree.ElementTree as ET
import csv
import os

FileName = '------.xml'
tree = ET.parse(FileName)
root = tree.getroot()[4]
csv_out = open('CsvOut', 'w')
csvwriter = csv.writer(csv_out)
count = 0
for child in root:
    generation = []
    parameters = []
    if count == 0:
        csv_head = ['Generation', 'Parameter Name', 'Parameter Number']
        csvwriter.writerow(csv_head)
        count = count + 1
    gen = FileName[:-4]
    generation.append(gen)
    parameters.append(generation)
    name = child.get('Name')
    parameters.append(name)
    num = child.get('PNum')
    parameters.append(num)
    csvwriter.writerow(parameters)
csv_out.close()

It's rather simple and you can do it in two steps:
First, enumerate all the xml files in the directory.
Then, run your code over those files.
import xml.etree.ElementTree as ET
import csv
import os
from glob import glob

# create csv writer
csv_out = open('CsvOut', 'w')
csvwriter = csv.writer(csv_out)

# write the header
csv_head = ['Generation', 'Parameter Name', 'Parameter Number']
csvwriter.writerow(csv_head)

# iterate over the xml files in the current directory
for FileName in glob("*.xml"):
    tree = ET.parse(FileName)
    root = tree.getroot()[4]
    for child in root:
        generation = []
        parameters = []
        gen = FileName[:-4]
        generation.append(gen)
        parameters.append(generation)
        name = child.get('Name')
        parameters.append(name)
        num = child.get('PNum')
        parameters.append(num)
        csvwriter.writerow(parameters)

# after iterating, close the csv file
csv_out.close()
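The question also notes that the files are organised differently and that some may not carry these attributes at all. As a hedged sketch (assuming the relevant elements are the 'Param' elements named in the question, and that Content.xml itself should be skipped), a more defensive variant could search the whole tree and skip anything without the attributes:

import xml.etree.ElementTree as ET
import csv
from glob import glob

with open('CsvOut', 'w', newline='') as csv_out:
    csvwriter = csv.writer(csv_out)
    csvwriter.writerow(['Generation', 'Parameter Name', 'Parameter Number'])
    for FileName in glob("*.xml"):
        if FileName == 'Content.xml':
            continue  # skip the index file itself
        root = ET.parse(FileName).getroot()
        # iter('Param') walks the whole tree, so each file's layout no longer matters
        for param in root.iter('Param'):
            name = param.get('Name')
            num = param.get('PNum')
            if name is None or num is None:
                continue  # this element doesn't carry the attributes we need
            csvwriter.writerow([FileName[:-4], name, num])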

Related

Python convert xml files to csv

I have a directory that contains several XML files that I would like to process, one by one, and export as CSV files.
Individually, it works perfectly with the script below:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('D:/scripts/xml/download_xml_1.xml')
data_out = open('D:/scripts/csv/output_1.csv', 'w', newline='', errors='ignore')
csvwriter = csv.writer(data_out)
col_names = ['Fichier', 'No. de document', 'Titre']
csvwriter.writerow(col_names)
root = tree.getroot()
for elem in root.iter(tag='Document'):
    row = []
    filetype = elem.find('FileType').text
    row.append(filetype)
    documentnumber = elem.find('DocumentNumber').text
    row.append(documentnumber)
    title = elem.find('Title').text
    row.append(title)
    csvwriter.writerow(row)
data_out.close()
But I'm going crazy trying to find a solution to do it one by one, and this is where I am so far:
import xml.etree.ElementTree as ET
import csv
import os

for my_files in os.listdir('D:/scripts/xml/'):
    tree = ET.parse(my_files)
    data_out = open('D:/scripts/csv/' + my_files[:-4] + '.csv', 'w', newline='', errors='ignore')
    csvwriter = csv.writer(data_out)
    col_names = ['Fichier', 'No. de document', 'Titre']
    csvwriter.writerow(col_names)
    root = tree.getroot()
    for elem in root.iter(tag='Document'):
        row = []
        filetype = elem.find('FileType').text
        row.append(filetype)
        documentnumber = elem.find('DocumentNumber').text
        row.append(documentnumber)
        title = elem.find('Title').text
        row.append(title)
        csvwriter.writerow(row)
    data_out.close()
Any help would be greatly appreciated.
Simply generalize your process into a defined method that receives a file name as input, then iteratively pass file names to it. Also, consider the with context manager to open the text connection without needing to close it explicitly.
import os
import csv
import xml.etree.ElementTree as ET

xml_path = r'D:\scripts\xml'
csv_path = r'D:\scripts\csv'

# DEFINED METHOD
def xml_to_csv(xml_file):
    csv_file = os.path.join(csv_path, f'Output_{xml_file[:-4]}.csv')
    tree = ET.parse(os.path.join(xml_path, xml_file))
    with open(csv_file, 'w', newline='', errors='ignore') as data_out:
        csvwriter = csv.writer(data_out)
        col_names = ['Fichier', 'No. de document', 'Titre']
        csvwriter.writerow(col_names)
        root = tree.getroot()
        for elem in root.iter(tag='Document'):
            row = [elem.find('FileType').text,
                   elem.find('DocumentNumber').text,
                   elem.find('Title').text]
            csvwriter.writerow(row)

# FILE ITERATION
for f in os.listdir(xml_path):
    xml_to_csv(f)
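If the directory could ever contain entries other than the XML files (just a precaution; the question only mentions XML files), a small filter in the iteration shown above keeps them away from the parser:

# only hand .xml files to the parser
for f in os.listdir(xml_path):
    if f.lower().endswith('.xml'):
        xml_to_csv(f)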

with path.open('r', encoding="utf-8") as file: AttributeError: 'generator' object has no attribute 'open'

I am not really sure how to access the file names and make the necessary changes in the script. I am trying to access some files which are inside the folders.
I also want to use these files as shown in the line
item = etree.Element('language', attrib={"lang": path.parent.name, "status": "Reviewed"})
import pathlib
import functools
import operator
import lxml.etree as etree
from lxml.builder import ElementMaker

ATTRIB = {"xsi": "test.xsd", "xmlns": "http://www.w3.org/2001/XMLSchema-instance"}

def is_element(node):
    return hasattr(node, 'attrib') and 'name' in node.attrib

def create_plural(item):
    pass

def main():
    cwd = pathlib.Path.cwd()
    directories = list(filter(lambda path: path.is_dir(), cwd.iterdir()))
    langs = [path.name for path in directories]
    files = map(operator.methodcaller('glob', '*.xml'), directories)
    #trees = dict.fromkeys(unique_names, dict())
    for path in files:
        with path.open('r', encoding="utf-8") as file:
            tree = etree.parse(file)
        root = tree.getroot()
        name = xml_path.with_suffix('').with_suffix('').name
        out_tree = trees[name]
        for child in filter(is_element, root):
            id = child.attrib['name']
            text = child.text
            if id not in out_tree:
                out_tree[id] = list()
            item = etree.Element('language', attrib={"lang": path.parent.name, "status": "Reviewed"})
            if child.tag == "plurals":
                item.text = create_plural(child)
            else:
                item.text = etree.CDATA(text)
            out_tree[id].append(item)

if __name__ == '__main__':
    main()

#name = '{}.strings.xml'.format(xml_file.with_suffix('').name) # name of the file
#out_p = out_path / lang / name # path of the output file where it should be located
#out_p.parent.resolve().mkdir(parents=True, exist_ok=True) # make directory
#text = etree.tostring(root, xml_declaration=True, pretty_print=True, encoding="utf-8")
#with out_p.open('wb') as file:
#    file.write(text)
Instead of:
with path.open('r', encoding="utf-8") as file:
    tree = etree.parse(file)
You can pass a filename (string) directly to parse:
tree = etree.parse(path)
path in your example is a string so it doesn't have an open function.
Maybe you meant:
with open(path, 'r', encoding="utf-8") as file:
    tree = etree.parse(file)
If you're trying to find XML file names in the current directory:
[f for f in os.listdir('.') if f.endswith('.xml')]
The issue is this:
files = map(operator.methodcaller('glob', '*.xml'), directories)
glob returns a generator of paths, so files is not a sequence of paths but a sequence of sequences of paths.
You need to either itertools.chain.from_iterable the entire thing into a single sequence, or use a nested loop. Or use a comprehension to straight-up unwrap the entire thing. map makes a lot of sense when you already have a function doing what you need, but that's not the case here, so comprehensions tend to be preferable:
files = (
    f
    for d in directories
    for f in d.glob('*.xml')
)
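For completeness, here is a minimal sketch of the itertools.chain.from_iterable alternative mentioned above; it produces the same flat stream of paths:

from itertools import chain

# flatten the per-directory glob generators into one stream of xml paths
files = chain.from_iterable(d.glob('*.xml') for d in directories)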

Python, how to get number of unique ids per file and store filename and unique ids for each file in csv file?

I am working on a project to count the unique commenters in a chat and store the file name and number of commenters of that chat in a CSV, for each file. However, the code I have now is opening all of the documents and counting all the commenters across the multiple files, so instead of getting the individual unique commenters per file, it is counting all the commenters across the multiple files. There are 10 unique commenters across all the files; however, I need to be able to see the number of unique commenters for each file and store that data in the CSV file (see the Desired Output for csv file picture). I feel like I am very close but I am stuck. Can anyone help with this issue or suggest other methods of doing this?
import os, sys, json
from collections import Counter
import csv

filename = ""
filepath = ""
jsondata = ""
dictjson = ""
commenterid = []
FName = []
UList = []
TextFiles = []
UCommenter = 0

def get_FilePathList():
    for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
        for file in files:
            ##Find File with specific ending
            if file.endswith("chatinfo.txt"):
                path = "/Users/ammcc/Desktop/"
                ##Get the File Path of the file
                filepath = os.path.join(path, file)
                ##Get the Filename of the file ending in chatinfo.txt
                head, filename = os.path.split(filepath)
                ##Append all Filepaths of files ending in chatinfo.txt to TextFiles array/list
                TextFiles.append(filepath)
                ##Append all Filenames of files ending in chatinfo.txt to FName array/list
                FName.append(filename)

def open_FilePath():
    for x in TextFiles:
        ##Open each filepath in TextFiles one by one
        open_file = open(x)
        ##Read that file line by line
        for line in open_file:
            ##Parse the Json of the file into jsondata
            jsondata = json.loads(line)
            ##Line not needed but, Parse the Json of the file into dictjson as Dictionary
            dictjson = json.dumps(jsondata)
            ## if the field commenter is found in jsondata
            if "commenter" in jsondata:
                ##Then, append the field ["commenter"]["_id"] **(nested value in the json)** into list commenterid
                commenterid.append(jsondata["commenter"]["_id"])
                ##Get and count the unique ids for the commenter
                Ucommenter = (len(set(commenterid)))
                ##Appended that unique count in UList
                UList.append(Ucommenter)
        ## create or append to the Commenter.csv file
        with open('Commenter.csv', 'a') as csvfile:
            filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            ##Write the individual filename and the unique commenters for that file
            filewriter.writerow([filename, Ucommenter])
        commenterid.clear()

##Issue: Not counting the commenters for each file and storing the filename and its specific number of commneters in csv.
##the cvs is being created but the rows in the csv is not generating correctly.

##Call the functions
get_FilePathList()
open_FilePath()
Current Output in csv file
Desired Output for csv file
Output after suggestion
Output and code after Nemanja Radojković solution:
Correct output format, but still not counting the unique commenters per file.
import json, os
import pandas as pd
import numpy as np
from collections import Counter

TextFiles = []
FName = []
csv_rows = []
commenterid = []
unique_id = []
NC = []

for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
    for file in files:
        if file.endswith("chatinfo.txt"):
            path = "/Users/ammcc/Desktop/"
            filepath = os.path.join(path, file)
            head, filename = os.path.split(filepath)
            TextFiles.append(filepath)
            FName.append(filename)
            n_commenters = 0
            with open(filepath) as open_file:
                for line in open_file:
                    jsondata = json.loads(line)
                    if "commenter" in jsondata:
                        commenterid.append(jsondata["commenter"]["_id"])
                        list_set = set(commenterid)
                        unique_list = (list(list_set))
                        for x in list_set:
                            n_commenters += 1
            commenterid.clear()
            csv_rows.append([filename, n_commenters])

df = pd.DataFrame(csv_rows, columns=['FileName', 'Unique_Commenters'])
df.to_csv('CommeterID.csv', index=False)
Try this:
import json
import os

import pandas as pd

TextFiles = []
FName = []
csv_rows = []

for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
    for file in files:
        ##Find File with specific ending
        if file.endswith("chatinfo.txt"):
            path = "/Users/ammcc/Desktop/"
            ##Get the File Path of the file
            filepath = os.path.join(path, file)
            ##Get the Filename of the file ending in chatinfo.txt
            head, filename = os.path.split(filepath)
            ##Append all Filepaths of files ending in chatinfo.txt to TextFiles array/list
            TextFiles.append(filepath)
            ##Append all Filenames of files ending in chatinfo.txt to FName array/list
            FName.append(filename)
            n_commenters = 0
            with open(filepath) as open_file:
                ##Read that file line by line
                for line in open_file:
                    ##Parse the Json of the file into jsondata
                    jsondata = json.loads(line)
                    ## if the field commenter is found in jsondata
                    if "commenter" in jsondata:
                        n_commenters += 1
            csv_rows.append([filename, n_commenters])

df = pd.DataFrame(csv_rows, columns=['filename', 'n_commenters'])
df.to_csv('some_filename.csv', index=False)
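Since the update above notes that the counts are still not unique per file, here is a minimal sketch (assuming the same one-JSON-object-per-line layout, and an illustrative output filename) that keeps a fresh set of ids for each file so counts cannot leak across files:

import json
import os

import pandas as pd

csv_rows = []
for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
    for file in files:
        if file.endswith("chatinfo.txt"):
            filepath = os.path.join(root, file)
            unique_ids = set()  # fresh set per file
            with open(filepath) as open_file:
                for line in open_file:
                    jsondata = json.loads(line)
                    if "commenter" in jsondata:
                        unique_ids.add(jsondata["commenter"]["_id"])
            csv_rows.append([file, len(unique_ids)])

df = pd.DataFrame(csv_rows, columns=['filename', 'unique_commenters'])
df.to_csv('UniqueCommenters.csv', index=False)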

Only portions of data are being written to a csv file, the rest is missing

I am parsing through many XML files and putting certain information into a CSV file. Because my XML files are named "1.xml", "2.xml", etc., I am using a for loop to cycle through the different XML file titles. However, depending on the range that I use in my for loop, my CSV file contains different data. For example, when my for loop range is 1:200 my CSV file includes info from my XML files 1 to 199. However, when I change my range to 1:300, my CSV file only contains info for my XML files 217 to 249. The info actually stored in my CSV file changes based on what I put in as the range for my for loop. Has anyone else had this error, and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path

# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)

data_head = []
data = []
missingfiles = 0
missfiles = []

MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)

data_head.append(MediaId)
# Family = member.find('Family').tag
data_head.append(Content)
data_head.append(ClassId)
data_head.append(Family)
# Species = member.find('Species').tag
data_head.append(Species)
# Genus = member.find('Genus').tag
data_head.append(Genus)
csvwriter.writerow(data_head)

for i in range(1, 190):
    #print (i)
    data = []
    inputfilename = str(i) + ".xml"
    my_file = Path(inputfilename)
    if my_file.is_file():
        data_labels = open('DataLabels.csv', 'w', newline='')
        tree = ET.parse(inputfilename)
        root = tree.getroot()
        MediaId = root[2].text
        Content = root[4].text
        ClassId = root[5].text
        Family = root[6].text
        Species = root[7].text
        Genus = root[8].text
        #print (vote)
        #count = 0
        #for Image in root.find('MediaId'):
        #print (child.tag, child.attrib)
        #name = child.find('MediaId').text
        # print (Image.find ('MediaId').text)
        ##csvwriter.writerow (data_head)
        #data = []
        #if count == 0:
        #    print ("count is zero i'm in loop")
        #    MediaId = member.find('MediaId').tag
        #    count = count + 1
        #else:
        #MediaId = root.findall('MediaId').text
        data.append(MediaId)
        data.append(Content)
        data.append(ClassId)
        #Family = member.find('Family').text
        data.append(Family)
        #Species = member.find('Species').text
        data.append(Species)
        #Genus = member.find('Genus').text
        data.append(Genus)
        csvwriter.writerow(data)
        data_labels.close()
        #print (data)
    else:
        missingfiles = missingfiles + 1
        missfiles = []
        missfiles.append(inputfilename)
        csvwriter2.writerow(missfiles)

print("missing", missingfiles, "files")
data_labels.close()
missing_files.close()
print("done")
Open the csv in append mode, else you are just overwriting the same file.
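A minimal sketch of that change, assuming the re-open inside the loop is kept at all (dropping it and reusing the writer created at the top of the script would work just as well):

# inside the loop: 'a' appends to DataLabels.csv instead of truncating it,
# which is what 'w' was doing on every iteration
data_labels = open('DataLabels.csv', 'a', newline='')
csvwriter = csv.writer(data_labels)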
I think you need to divide your script into small, readable functions.
First, you can create a function to parse an XML file:
import xml.etree.ElementTree as ET

def parse_xml_file(xml_path):
    """ Parse an XML file and return the data. """
    # type: (str) -> list
    tree = ET.parse(xml_path)
    root = tree.getroot()
    return [
        root[2].text,
        root[4].text,
        root[5].text,
        root[6].text,
        root[7].text,
        root[8].text]
This function parses an XML file and returns one record containing a list of values.
Then, you can create a function to iterate over a list of XML files (the existing files) and populate the CSV file:
import csv
import io
import os

def populate_data_labels(xml_path_list, work_dir="."):
    header = ["Media Id", "Family", "Species", "Genus", "Content", "ClassId"]
    with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in xml_path_list:
            writer.writerow(parse_xml_file(xml_path))
This function uses parse_xml_file() to extract each record.
You can create a function to log the missing files. You can use CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
    header = ["Filename"]
    with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in missing_files:
            writer.writerow([os.path.basename(xml_path)])
Finally, you can write a function which searches for the XML files and calls the previous functions:
def parse_work_dir(work_dir="."):
    all_files = [os.path.join(work_dir, "{0}.xml".format(idx))
                 for idx in range(1, 190)]
    existing_files = (path for path in all_files if os.path.exists(path))
    populate_data_labels(existing_files, work_dir)

    missing_files = (path for path in all_files if not os.path.exists(path))
    populate_missing_files(missing_files, work_dir)
Usage:
parse_work_dir("/path/to/your/working/dir")

Python - stuck at reading a specific row from a csv

I need to add columns to a "matched" shape file based on a csv. I have one last step to complete which is to get the value to enter into the shp from the csv.
I get:
readCSV[rowID]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: '_csv.reader' object is not subscriptable
The stripped down CSV is
The files look like
The code matches OVL_CAT + OVL2_DESC to the File Name.
I then get the code to add a column called LGA_CODE and need to populate it with '583094', which is row 2, column 1... how do I get this when I can't call FileList[2] to get row 2 from the csv (row 3 in the example below but 2 in Python)?
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice

top = os.getcwd()  # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch'  # change if you want output somewhere else
top = RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011'  # source of your data (subdirectories searched as well
outDIR = top + "\\workingFiles"  # directory where output is written to. Includes temp files
finalDIR = top + "\\final"  # folder for final data only
AOI = 'AOI.csv'  # name of the file containing the las file names in the second column
Compare_Column = 2
Compare_Column2 = 3
# END setting base paths
# NOTHING BELOW should need editing.

FileTypes = ['shp']
SearchStrings = []
filecount = 0
List = []
count = 0
x = 0
os.chdir(top)

#Generate list with unique file name codes from CSV
FileList = csv.reader(open(AOI))
SearchStrings = []
rowID = 0
for File in FileList:
    #SearchStrings.append(File[0]+","+File[1])
    SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
    rowID = rowID + 1

for root, dirs, files in os.walk(SourceDIR, topdown=False):
    for fl in files:
        currentFile = os.path.join(root, fl)
        for FileType in FileTypes:
            status = str.endswith(currentFile, FileType)
            if str(status) == 'True':
                List.append(currentFile)
                for SearchString in SearchStrings:
                    #print currentFile
                    #print SearchString
                    if str(SearchString in currentFile) == 'True':
                        #print str(currentFile)+str(status)
                        List.append(currentFile)
                        filecount = filecount + 1
#del fl

# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
    headers = list(islice(fin, headers_count))
delimiter = ','
header = str(headers)
header_list = header.split(delimiter)

# Process matching files
for fl in List:
    header_count = 0
    for header in header_list:
        dfStore = fl
        #arcpy.AddField_management(dfStore, str(header), 'TEXT')
        # Get RowID to read column data from
        filename = fl[fl.rfind('\\')+1:fl.rfind('_')]
        for field in SearchStrings:
            #print field, filename
            if field.endswith(filename):
                rowID = field[:field.find('_')]
                with open(AOI, 'rb') as f:
                    readCSV = csv.reader(f)
                    text = readCSV[rowID][1]
##                    arcpy.CalculateField_management(fl, header, text, "PYTHON_9.3")
=== UPDATED CODE BASED ON COMMENTS - it's all working fine if anyone needs it.
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice

top = os.getcwd()  # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch'  # change if you want output somewhere else
top = RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011'  # source of your data (subdirectories searched as well
outDIR = top + "\\workingFiles"  # directory where output is written to. Includes temp files
finalDIR = top + "\\final"  # folder for final data only
AOI = 'AOI.csv'  # name of the file containing the las file names in the second column
Compare_Column = 3
Compare_Column2 = 4
# END setting base paths
# NOTHING BELOW should need editing.

FileTypes = ['shp']
SearchStrings = []
filecount = 0
List = []
count = 0
x = 0
os.chdir(top)

#Generate list with unique file name codes from CSV
FileList = csv.reader(open(AOI))
SearchStrings = []
rows = []
#FinalList=[]
rowID = 0
for File in FileList:
    #SearchStrings.append(File[0]+","+File[1])
    SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
    rows.append(File)
    #FinalList.append()
    rowID += 1

for root, dirs, files in os.walk(SourceDIR, topdown=False):
    for fl in files:
        currentFile = os.path.join(root, fl)
        for FileType in FileTypes:
            status = str.endswith(currentFile, FileType)
            if status:
                #List.append(currentFile)
                for SearchString in SearchStrings:
                    #print currentFile, SearchString
                    if str(SearchString[SearchString.find('_')+1:] in currentFile) == 'True':
                        #print str(currentFile)+str(status)
                        List.append(currentFile)
                        filecount = filecount + 1
#del fl

# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
    headers = list(islice(fin, headers_count))
delimiter = ','
header = str(headers)
header_listT = header.split(delimiter)
header_list = []
for hdr in header_listT:
    header_list.append(arcpy.ValidateTableName(hdr)[:10])

# Process matching files
columnID = 1
for fl in List:
    header_count = 0
    for header in header_list:
        print header
        dfStore = fl
        try:
            arcpy.AddField_management(dfStore, str(header), 'TEXT')
        except:
            pass
        # Get RowID to read column data from
        filename = fl[fl.rfind('\\')+1:fl.rfind('_')]
        for field in SearchStrings:
            #print field, filename
            #print header, field
            if field.endswith(filename):
                #print 'FOUND......................'
                column_count = len(fl)
                if columnID < len(header_list):
                    rowID = int(field[:field.find('_')])
                    text = rows[rowID][columnID]
                    print filename, header, text
                    columnID += 1
                arcpy.CalculateField_management(fl, header, "text", "PYTHON_9.3")
#arcpy.CalculateField_management("P:/2012/273_CCRC_Townplanning_Datasets/Working/scratch/OM_011/OM_011_Waterway_Envelopes_ccrc.shp","LGA_CODE","5","PYTHON","#")
Your problem is in these two lines:
readCSV= csv.reader(f)
text=readCSV[rowID][1]
csv.reader is an iterable over the lines of the file; it cannot be directly indexed. You could use islice to get the element you want (islice(readCSV, rowID, rowID+1).next()), though a neater solution would just be to store a dictionary mapping rowID to the AOI row when you read it the first time (in the SearchStrings loop):
FileList = csv.reader(open(AOI))
SearchStrings = []
rows = []
rowID = 0
for File in FileList:
    #SearchStrings.append(File[0]+","+File[1])
    SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
    rows.append(File)
    rowID = rowID + 1

... # later

rowID = int(field[:field.find('_')])
text = rows[rowID][1]
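For completeness, a sketch of the islice variant mentioned above, assuming rowID is still the string sliced out of the search string as in the original code:

from itertools import islice

with open(AOI, 'rb') as f:
    readCSV = csv.reader(f)
    # skip ahead to the wanted row; islice needs an integer index
    row = next(islice(readCSV, int(rowID), int(rowID) + 1))
text = row[1]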
