I have a data frame called filedataframe that holds all the file paths. My code works for pulling what I want from an individual XML file, but it is currently set up for a single file. How do I make it iterate through filedataframe and use each file path? I want to collect rootId, file_Name, unique_ID, and employee_badge together with their respective file path.
import re
import pathlib
import os
import pandas as pd
import xml.etree.ElementTree as ET
filesdataframe = []
# example path would be Defined Contributions,
xmlfile = (r'INVESTING.cdm')
# We are parsing it.
tree = ET.parse(xmlfile)
#We then get the root.
root = tree.getroot()
for elm in root.findall('.//{object}IntraModelReport'):
    print(elm.text)

for Model in root.findall('.//{object}IntraModelReport'):
    rootId = elm.attrib
    file_Name = Model.find("{attribute}Code").text
    unique_ID = Model.find("{attribute}ObjectID").text
    employee_badge = Model.find("{attribute}Creator").text
    print(rootId, file_Name, unique_ID, employee_badge)
Try this.
import re
import pathlib
import os
import pandas as pd
import xml.etree.ElementTree as ET
from typing import Dict, List
def process_single_xmlfile(xmlfile: str, verbose: bool = False) -> Dict:
    tree = ET.parse(xmlfile)
    root = tree.getroot()

    for elm in root.findall('.//{object}IntraModelReport'):
        print(elm.text)

    package: Dict = {'xmlfile': xmlfile, 'models': []}
    for Model in root.findall('.//{object}IntraModelReport'):
        rootId = elm.attrib
        file_Name = Model.find("{attribute}Code").text
        unique_ID = Model.find("{attribute}ObjectID").text
        employee_badge = Model.find("{attribute}Creator").text
        if verbose:
            print(rootId, file_Name, unique_ID, employee_badge)
        package['models'].append(dict(
            rootId=rootId,
            file_Name=file_Name,
            unique_ID=unique_ID,
            employee_badge=employee_badge,
        ))
    return package
#### LOOP OVER
# all the results will be stored in this list
extracts: List[Dict] = []
# xmlfiles is a list of xml filenames: You need to provide this
# you can replace "xmlfiles" with your "filedataframe".
for xmlfile in xmlfiles:
    # set verbose=True to enable printing
    extracts.append(process_single_xmlfile(xmlfile, verbose=False))
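Since your paths live in a data frame rather than a plain list, here is a minimal sketch of looping over it and flattening the results into one data frame. It assumes the column holding the paths is called file_path; adjust that name to whatever your filedataframe actually uses.

# Hypothetical column name: adjust 'file_path' to whatever your filedataframe uses.
extracts = [process_single_xmlfile(p, verbose=False) for p in filedataframe['file_path']]

# One row per model, keeping the source file path alongside the extracted fields.
rows = [
    {'xmlfile': package['xmlfile'], **model}
    for package in extracts
    for model in package['models']
]
result_df = pd.DataFrame(rows)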
Related
I have a folder with 50 .csv files. The .csv files are auto-generated results output from a process-based model, with long, automatically generated names. For example, sandbox_username_vetch_scaleup_IA_1.csv, sandbox_username_vetch_scaleup_IA_2.csv, and so on up to sandbox_username_vetch_scaleup_IA_50.csv.
I am trying to shorten the file names so that they become IA_1, IA_2, ... up to IA_50, and then have the new .csv file name added as a column to the data frame. Here is what I have tried so far:
# import necessary libraries
import pandas as pd
import os
import glob
import sys
from pathlib import Path
import re
data_p = "/Users/Username/Documents/HV_Scale/CWAD"
output_p = "/Users/Username/Documents/HV_Scale/CWAD"
retval = os.getcwd()
print (retval) # see in which folder you are
os.chdir(data_p) # move to the folder with your data
os.getcwd()
filenames = sorted(glob.glob('*.csv'))
fnames = list(filenames) # get the names of all your files
#print(fnames)
#Loop over
for f in range(len(fnames)):
    print(f'fname: {fnames[f]}\n')
    pfile = pd.read_csv(fnames[f], delimiter=",")  # read in file
    # extract filename
    filename = fnames[f]
    parts = filename.split(".")  # giving you the number in file name and .csv
    only_id = parts[0].split("_")  # if there is a bracket included
    # get IA from your file
    filestate = pfile["IA"][0]  # assuming this is on the first row
    filestate = str(filestate)
    # get new filename
    newfilename = only_id[0]+"-"+filestate+parts[1]
    # save your file (don't put a slash at the end of your directories on top)
    pfile.to_csv(output_p+"/"+newfilename, index=False, header=True)
Here is the code for adding the csv file name as a column
import glob
import os
import shutil
import sys
import pandas as pd
path = '/Users/Username/Documents/HV_Scale/IA_CWAD/short'
all_files = glob.glob(os.path.join(path, "*.csv"))
names = [os.path.basename(x) for x in glob.glob(path+'\*.csv')]
df = pd.DataFrame()
for file_ in all_files:
    file_df = pd.read_csv(file_, sep=';', parse_dates=[0], infer_datetime_format=True, header=None)
    file_df['file_name'] = file_
    df = df.append(file_df)
# However, this adds the old csv file name and not the renamed one
In order to rename and move these files, all you need is:
import glob
import os
import shutil
import sys
SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'
for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
    idx = file.index('_IA_')
    filename = file[idx+1:]
    target = os.path.join(TARGET, filename)
    if os.path.exists(target):
        print(f'Target file {target} already exists', file=sys.stderr)
    else:
        shutil.copy(file, target)
As there's nothing in the OP's question that tries to handle modification of the CSV files, that is left as an exercise for the OP.
Source and target directories should be different; otherwise this can lead to ambiguous results.
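To also get the shortened name into the data itself, one option (a sketch, not part of the answer above) is to add a file_name column while copying, reusing the same _IA_ split; the column name file_name is just an assumed choice.

import glob
import os
import sys
import pandas as pd

SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'

for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
    idx = file.index('_IA_')
    short_name = file[idx+1:]  # e.g. IA_1.csv
    target = os.path.join(TARGET, short_name)
    if os.path.exists(target):
        print(f'Target file {target} already exists', file=sys.stderr)
        continue
    df = pd.read_csv(file)
    df['file_name'] = short_name  # assumed column name for the new, shortened file name
    df.to_csv(target, index=False)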
Here is the data from the xml file,
<SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/">
  <SOAP-ENV:Header />
  <SOAP-ENV:Body>
    <ADD_LandIndex_001>
      <CNTROLAREA>
        <BSR>
          <status>ADD</status>
          <NOUN>LandIndex</NOUN>
          <REVISION>001</REVISION>
        </BSR>
      </CNTROLAREA>
      <DATAAREA>
        <LandIndex>
          <reportId>AMI100031</reportId>
          <requestKey>R3278458</requestKey>
          <SubmittedBy>EN4871</SubmittedBy>
          <submittedOn>2015/01/06 4:20:11 PM</submittedOn>
          <LandIndex>
            <agreementdetail>
              <agreementid>001 4860</agreementid>
              <agreementtype>NATURAL GAS</agreementtype>
              <currentstatus>
                <status>ACTIVE</status>
                <statuseffectivedate>1965/02/18</statuseffectivedate>
                <termdate>1965/02/18</termdate>
              </currentstatus>
              <designatedrepresentative></designatedrepresentative>
            </agreementdetail>
          </LandIndex>
        </LandIndex>
      </DATAAREA>
    </ADD_LandIndex_001>
  </SOAP-ENV:Body>
</SOAP-ENV:Envelope>
I want to save in a dataframe: 1) the path and 2) the text of the element corresponding to that path, and only for the elements that contain a value. So I would like to have something like this:
Path Value
0 Body/ADD_LandIndex_001/CNTROLAREA/BSR/status ADD
1 Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN LandIndex
2 Body/ADD_LandIndex_001/CNTROLAREA/BSR/REVISION 001
I have this little code that does not work! It returns an empty dataframe, although I can see from the print(d) in the loop of the function that it picks up each element correctly. I don't really see what is wrong. Can anyone find why it is empty and not working?
from lxml import etree as et
from collections import defaultdict
import pandas as pd
import os
filename = 'file_try.xml'
namespace = '{http://schemas.xmlsoap.org/soap/envelope/}'
with open(filename, 'rb') as file:
    root = et.parse(file).getroot()
    tree = et.ElementTree(root)
col_name = ['Path', 'Value']
dataF = pd.DataFrame([],columns = col_name)
def traverse(el, d):
    if len(list(el)) > 0:
        for child in el:
            traverse(child, d)
    else:
        if el.text is not None:
            d = d.append({'Path': tree.getelementpath(el).replace(namespace, ''), 'Value': el.text}, ignore_index=True)
            print(d)
    return d
df = traverse(root,dataF)
print(df)
df.to_excel("data_2.xlsx")
Try this.
from simplified_scrapy import SimplifiedDoc, utils
rows = []
rows.append(['Path', 'Value'])
xml = utils.getFileContent('file_try.xml')
doc = SimplifiedDoc(xml)
body = doc.select('SOAP-ENV:Body')
def getPathValue(node, path):
    path = path + '/' + node['tag']  # Splicing path
    children = node.children
    if children:
        traverseNodes(children, path)
    else:
        rows.append([path, node.text])

def traverseNodes(nodes, path):
    for node in nodes:  # Traversing child nodes
        getPathValue(node, path)

traverseNodes(body.children, "Body")
# print(rows)
utils.save2csv('data_2.csv', rows)
Result:
[['Body/ADD_LandIndex_001/CNTROLAREA/BSR/status', 'ADD'], ['Body/ADD_LandIndex_001/CNTROLAREA/BSR/NOUN', 'LandIndex'], ['Body/ADD_LandIndex_001/CNTROLAREA/BSR/REVISION', '001'], ['Body/ADD_LandIndex_001/DATAAREA/LandIndex/reportId', 'AMI100031'], ['Body/ADD_LandIndex_001/DATAAREA/LandIndex/requestKey', 'R3278458'],
...
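Since the question asks for a dataframe rather than a CSV file, the rows list above can be turned into one directly; a small sketch (the header row is the first entry of rows):

import pandas as pd

# The first entry of `rows` is the header ['Path', 'Value']; the rest are data rows.
df = pd.DataFrame(rows[1:], columns=rows[0])
print(df.head())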
I found my mistake by looking at the answer of @yazz: DataFrame.append returns a new dataframe rather than appending in place, so the rows added inside the recursive calls were being lost; collecting the rows in a plain list and building the dataframe once at the end fixes it.
Here is the code:
from lxml import etree as et
import pandas as pd
import os
filename = 'file_try.xml'
namespace = '{http://schemas.xmlsoap.org/soap/envelope/}'
with open(filename, 'rb') as file:
    root = et.parse(file).getroot()
    tree = et.ElementTree(root)
col_name = ['Path', 'Value']
data = []
def traverse(el, d):
    if len(list(el)) > 0:
        for child in el:
            traverse(child, d)
    else:
        if el.text is not None:
            d.append([(tree.getelementpath(el) + str(el.xpath('@Ccy'))).replace(namespace, ''), el.text])
            print(d)
    return d
df = pd.DataFrame(traverse(root,data), columns = col_name)
df.to_excel("data_2.xlsx")
I have a directory containing several XML files that I would like to process, one by one, and export each of them as a CSV file.
Individually, it works perfectly with the script below:
import xml.etree.ElementTree as ET
import csv
tree = ET.parse('D:/scripts/xml/download_xml_1.xml')
data_out = open('D:/scripts/csv/output_1.csv', 'w',newline='', errors='ignore')
csvwriter = csv.writer(data_out)
col_names = ['Fichier','No. de document','Titre']
csvwriter.writerow(col_names)
root = tree.getroot()
for elem in root.iter(tag='Document'):
    row = []
    filetype = elem.find('FileType').text
    row.append(filetype)
    documentnumber = elem.find('DocumentNumber').text
    row.append(documentnumber)
    title = elem.find('Title').text
    row.append(title)
    csvwriter.writerow(row)
data_out.close()
But I'm going crazy trying to find a solution to do it for each file, one by one, and this is where I am so far:
import xml.etree.ElementTree as ET
import csv
import os
for my_files in os.listdir('D:/scripts/xml/'):
    tree = ET.parse(my_files)
    data_out = open('D:/scripts/csv/' + my_files[:-4] + '.csv', 'w', newline='', errors='ignore')
    csvwriter = csv.writer(data_out)
    col_names = ['Fichier', 'No. de document', 'Titre']
    csvwriter.writerow(col_names)
    root = tree.getroot()
    for elem in root.iter(tag='Document'):
        row = []
        filetype = elem.find('FileType').text
        row.append(filetype)
        documentnumber = elem.find('DocumentNumber').text
        row.append(documentnumber)
        title = elem.find('Title').text
        row.append(title)
        csvwriter.writerow(row)
    data_out.close()
Any help would be greatly appreciated.
Simply generalize your process into a defined method that receives a file name as input, then iteratively pass file names to it. Also, consider the with context manager to open the text connection without needing to close it explicitly.
import os
import csv
import xml.etree.ElementTree as ET
xml_path = r'D:\scripts\xml'
csv_path = r'D:\scripts\csv'
# DEFINED METHOD
def xml_to_csv(xml_file):
    csv_file = os.path.join(csv_path, f'Output_{xml_file[:-4]}.csv')
    tree = ET.parse(os.path.join(xml_path, xml_file))

    with open(csv_file, 'w', newline='', errors='ignore') as data_out:
        csvwriter = csv.writer(data_out)
        col_names = ['Fichier', 'No. de document', 'Titre']
        csvwriter.writerow(col_names)

        root = tree.getroot()
        for elem in root.iter(tag='Document'):
            row = [elem.find('FileType').text,
                   elem.find('DocumentNumber').text,
                   elem.find('Title').text]
            csvwriter.writerow(row)
# FILE ITERATION
for f in os.listdir(xml_path):
    xml_to_csv(f)
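Note that os.listdir returns every entry in the folder, not just XML files; if the directory can contain anything else, a small guard (a sketch, not part of the answer above) avoids a parse error on non-XML entries:

# FILE ITERATION, restricted to .xml entries only (the extension check is an assumption)
for f in os.listdir(xml_path):
    if f.lower().endswith('.xml'):
        xml_to_csv(f)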
I am not really sure how to access the file names and make the necessary changes in the script below. I am trying to access some files that are inside subfolders.
I also want to use these files as shown in the line
item = etree.Element('language', attrib={"lang": path.parent.name, "status": "Reviewed"})
import pathlib
import functools
import operator
import lxml.etree as etree
from lxml.builder import ElementMaker
ATTRIB = {"xsi": "test.xsd", "xmlns": "http://www.w3.org/2001/XMLSchema-instance"}
def is_element(node):
    return hasattr(node, 'attrib') and 'name' in node.attrib

def create_plural(item):
    pass

def main():
    cwd = pathlib.Path.cwd()
    directories = list(filter(lambda path: path.is_dir(), cwd.iterdir()))
    langs = [path.name for path in directories]
    files = map(operator.methodcaller('glob', '*.xml'), directories)
    #trees = dict.fromkeys(unique_names, dict())
    for path in files:
        with path.open('r', encoding="utf-8") as file:
            tree = etree.parse(file)
        root = tree.getroot()
        name = xml_path.with_suffix('').with_suffix('').name
        out_tree = trees[name]
        for child in filter(is_element, root):
            id = child.attrib['name']
            text = child.text
            if id not in out_tree:
                out_tree[id] = list()
            item = etree.Element('language', attrib={"lang": path.parent.name, "status": "Reviewed"})
            if child.tag == "plurals":
                item.text = create_plural(child)
            else:
                item.text = etree.CDATA(text)
            out_tree[id].append(item)

if __name__ == '__main__':
    main()

#name = '{}.strings.xml'.format(xml_file.with_suffix('').name)  # name of the file
#out_p = out_path / lang / name  # path of the output file where it should be located
#out_p.parent.resolve().mkdir(parents=True, exist_ok=True)  # make directory
#text = etree.tostring(root, xml_declaration=True, pretty_print=True, encoding="utf-8")
#with out_p.open('wb') as file:
#    file.write(text)
Instead of:
with path.open('r', encoding="utf-8") as file:
    tree = etree.parse(file)
You can pass a filename (string) directly to parse:
tree = etree.parse(path)
path in your example is a string so it doesn't have an open function.
Maybe you meant:
with open(path, 'r', encoding="utf-8") as file:
    tree = etree.parse(file)
If you are trying to find XML file names in the current directory:
[f for f in os.listdir('.') if f.endswith('.xml')]
The issue is this:
files = map(operator.methodcaller('glob', '*.xml'), directories)
glob returns a generator of paths, so files is not a sequence of paths but a sequence of sequences of paths.
You need to either itertools.chain.from_iterable the entire thing into a single sequence, use a nested loop, or use a comprehension to unwrap the entire thing straight away. map makes a lot of sense when you already have a function doing what you need, but that's not the case here, so comprehensions tend to be preferable:
files = (
    f
    for d in directories
    for f in d.glob('*.xml')
)
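For completeness, the itertools.chain.from_iterable variant mentioned above would look roughly like this (a sketch, keeping the original map call):

import itertools
import operator

# Flatten the sequence of glob() generators into one flat iterator of paths.
files = itertools.chain.from_iterable(
    map(operator.methodcaller('glob', '*.xml'), directories)
)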
I need to parse a directory of XML files into one large CSV file. I need certain attributes under the element 'Param' (the attributes are 'Name' and 'PNum'). There is another XML file in the directory called Content.xml from which I can get the names of all the other XML files and set them as the FileName. The issue is that I cannot figure out how to get these attributes in each XML file, as each XML file has a different organisation and some don't seem to have these attributes in the first place.
I have written code that works for one of the XML files in the directory and outputs a CSV file with all the relevant information.
import xml.etree.ElementTree as ET
import csv
import os
FileName = '------.xml'
tree = ET.parse(FileName)
root = tree.getroot()[4]
csv_out = open('CsvOut', 'w')
csvwriter = csv.writer(csv_out)
count = 0
for child in root:
    generation = []
    parameters = []
    if count == 0:
        csv_head = ['Generation', 'Parameter Name', 'Parameter Number']
        csvwriter.writerow(csv_head)
        count = count + 1
    gen = FileName[:-4]
    generation.append(gen)
    parameters.append(generation)
    name = child.get('Name')
    parameters.append(name)
    num = child.get('PNum')
    parameters.append(num)
    csvwriter.writerow(parameters)
csv_out.close()
It's rather simple and you can do it in two steps:
First, enumerate all the XML files in the directory.
Then, run your code over these files.
import xml.etree.ElementTree as ET
import csv
import os
from glob import glob
# create csv writer
csv_out = open('CsvOut', 'w')
csvwriter = csv.writer(csv_out)
# write the header
csv_head = ['Generation', 'Parameter Name', 'Parameter Number']
csvwriter.writerow(csv_head)
# iterate over the xml files in the current directory
for FileName in glob("*.xml"):
    tree = ET.parse(FileName)
    root = tree.getroot()[4]

    for child in root:
        generation = []
        parameters = []

        gen = FileName[:-4]
        generation.append(gen)
        parameters.append(generation)

        name = child.get('Name')
        parameters.append(name)
        num = child.get('PNum')
        parameters.append(num)
        csvwriter.writerow(parameters)
# after iterating, close the csv file
csv_out.close()
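Since some of the files have a different layout and may not contain Param elements at all, a slightly more defensive variant (a sketch, not the answer's code) searches the whole tree for Param elements instead of hard-coding root[4] and also skips Content.xml; the names 'Param', 'Name' and 'PNum' come from the question, everything else is assumed.

import csv
import xml.etree.ElementTree as ET
from glob import glob

with open('CsvOut.csv', 'w', newline='') as csv_out:
    csvwriter = csv.writer(csv_out)
    csvwriter.writerow(['Generation', 'Parameter Name', 'Parameter Number'])

    for FileName in glob("*.xml"):
        if FileName == 'Content.xml':  # skip the index file
            continue
        root = ET.parse(FileName).getroot()
        # .iter() walks the whole tree, so the layout of each file no longer matters
        for param in root.iter('Param'):
            name = param.get('Name')
            num = param.get('PNum')
            if name is None and num is None:
                continue  # element without the attributes we need
            csvwriter.writerow([FileName[:-4], name, num])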