Iterate through each XML file - python

I currently have code that passes information from an XML file to Report Portal. Each XML file is located in its own folder, and there are many such folders. At the moment the parser only passes on the data of the last XML file held in memory, even though it recognizes all the other files.
this is my code for now:
# Start a Report Portal launch, then look in every sub-folder of r_path for .xml
# files and report their <entry> elements: type "TM" starts a SUITE item,
# type "TR" starts a STEP item under the current suite.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the loops below cannot be confirmed from the text alone.
launch = service.start_launch(name=launch_name,
attributes=rp_attributes,
start_time=timestamp(),
description=launch_doc)
r_path='\\\\esw-fs01\\esw_niagara_no_bck\\BuildResults\\master\\0.1.52.68_390534\\installation_area\\autotestlogs_top'
root = os.listdir(r_path)
for entry in root:
subdir_path = os.path.join(r_path, entry) # create the absolute path of the subdir
if os.path.isdir(subdir_path): # check if it is a folder
subdir_entries = os.listdir(subdir_path) # get the content of the subdir
for subentry in subdir_entries:
if subentry.endswith('.xml'):
subentry_path = os.path.join(subdir_path, subentry)
tree = ET.parse(subentry_path)
# NOTE: this rebinds 'root', the same name that holds the directory listing above.
root=tree.getroot()
# NOTE: the loop variable reuses the name 'subentry_path'; from here on it is an
# XML element, no longer the file path.
for subentry_path in root.iter('entry'):
if subentry_path.get('type') == "TM":
# suite_item_id is read here before any assignment visible in this snippet --
# presumably it is initialised (e.g. to None) earlier; verify.
if suite_item_id != None:
service.finish_test_item(item_id=suite_item_id, end_time=timestamp(), status=tm_verdict)
suite=subentry_path.find('name').text
description=subentry_path.find('messages').text
verdict=subentry_path.find('verdict').text
# Any verdict text other than "True"/"False" leaves tm_verdict unchanged.
if verdict=="True":
tm_verdict="PASSED"
elif verdict=="False":
tm_verdict="FAILED"
suite_item_id = service.start_test_item(name=suite,
description=description,
attributes=rp_attributes,
start_time=timestamp(),
item_type="SUITE")
if subentry_path.get('type') == "TR":
name = subentry_path.find('name').text
verdict = subentry_path.find('verdict').text
# 'link' and 'duration' are assigned but not used in the visible code.
link = subentry_path.find('link').text
duration = 10
description = subentry_path.find('messages').text
# The mapped verdict is not passed to any call in the visible code.
if verdict=="True":
verdict="PASSED"
elif verdict=="False":
verdict="FAILED"
start_time=timestamp()
# The returned item_id is not finished (no finish_test_item call) in the visible code.
item_id = service.start_test_item(name=name,
description=description,
start_time=start_time,
parent_item_id=suite_item_id,
item_type="STEP",
parameters={"key1": "val1",
"key2": "val2"})
The goal is to make it read all the files. Any help will be appreciated.

You could first build a list of paths, then in the second loop parse the files.
# First pass: collect the path of every .xml file that sits directly inside a
# sub-folder of r_path. The name 'root' is deliberately not used here, so it
# stays free for the parsed XML tree below.
files = []
for entry in os.listdir(r_path):
    subdir_path = os.path.join(r_path, entry)
    if not os.path.isdir(subdir_path):
        continue
    files.extend(
        os.path.join(subdir_path, name)
        for name in os.listdir(subdir_path)
        if name.endswith('.xml')
    )

# Second pass: parse each collected file in turn.
for xml_file in files:
    root = ET.parse(xml_file).getroot()
    for subentry_path in root.iter('entry'):
        ...
As a side note, it's advisable to use something more canonical to get all the files, like glob:
import glob
# r_path is the source directory defined in the question's code. With
# recursive=True the "**" component matches .xml files at any depth below it.
filelist = glob.glob(os.path.join(r_path, "**/*.xml"), recursive=True)
filelist is now a list of paths to all the xml files in the source directory. This will save you a couple of lines and indentations.

Related

Optimize execution time for retrieving data from xml and copy images

the purpose of the following code is to copy image files from one directory to another directory and reorganize the images in a hierarchical structure based on information extracted from XML files.
from bs4 import BeautifulSoup as bs
import shutil
import os
import glob
campaign_name = "CAMPAIGN_2020"
xml_directory = r'XML_DIRECTORY'
picture_directory = r'PICTURE_DIRECTORY'
output_directory = r'OUTPUT_DIRECTORY'
# Copy every picture that is referenced by an <img> element of one XML document
# into output_directory, under a path assembled from attributes found in the XML.
#   content       -- XML text; parsed with BeautifulSoup using the "lxml" parser
#   picture_files -- iterable of source picture paths
# Cost: every picture is compared against every <img> of the document, so the
# work grows with len(picture_files) * number of <img> elements, and this is
# repeated for each XML file the function is called with.
def copy_files(content, picture_files):
bs_content = bs(content, "lxml")
images = bs_content.find_all("images")
for picture in picture_files:
# 'i' from enumerate is not used in the visible code.
for i, image_group in enumerate(images):
for image in image_group.find_all('img'):
# Match on the file name only; the directory parts of both paths are ignored.
if os.path.basename(image['src']) == os.path.basename(picture):
src = image['src']
station = image['station']
# The remaining metadata is read from the first <data> / <assignment> tag found
# under the <img>'s grandparent, and from the grandparent's own 'date' attribute.
first_field = image.parent.parent.data['first_field']
second_field = image.parent.parent.data['second_field']
start = int(image.parent.parent.data['start'])
end = int(image.parent.parent.data['end'])
# NOTE(review): start - end is negative whenever end > start -- confirm the intended sign.
length = start - end
class_name = image.parent.parent.assignment['class_name']
number = image.parent.parent.assignment['number']
img_nr = int(image['img_nr'])
location = image.parent.parent.assignment['location']
date = image.parent.parent['date']
# set the complete picture path
# (adjust_date, adjust_location and adjust_img_nr are helpers not shown in this snippet)
picture_path = f'{class_name}{number}\{first_field}_{second_field}_{length}_{start}_{end}\{adjust_date(date)}\{campaign_name}\{adjust_location(location)}\{adjust_img_nr(img_nr)}\{station.zfill(5)}.jpg'
# create new subdirectories if they do not already exist
os.makedirs(os.path.join(output_directory, os.path.dirname(picture_path)), exist_ok=True)
src_file = picture # original picture path
dst_file = os.path.join(output_directory, picture_path) # assembled target path
shutil.copy(src_file, dst_file)
# consider files in all subdirectories that end with .jpg, adjust if necessary
picture_list = glob.glob(picture_directory + '\**\*.jpg', recursive=True)

for path in os.listdir(xml_directory):
    # only consider files that end with .xml
    if not path.endswith(".xml"):
        continue
    # Read the whole document as one string and hand it to copy_files.
    with open(os.path.join(xml_directory, path), "r") as file:
        xml_content = file.read()
    copy_files(xml_content, picture_list)
I tested the code and it works for the most part. Copying 20 pictures takes the tool around 2 hours, so I have to drastically improve the execution time. How can I do that?
To give you an idea: I have around 8k xml files and around 400k pictures :D

with path.open('r', encoding="utf-8") as file: AttributeError: 'generator' object has no attribute 'open'

I am not really sure how to access the file names and do the necessary changes as written in the script? I am trying to access some files which are inside the folders.
I also want to use these files as shown in line
item = etree.Element('language', attrib={"lang": path.parent.name, "status": "Reviewed"})
import pathlib
import functools
import operator
import lxml.etree as etree
from lxml.builder import ElementMaker
ATTRIB = {"xsi": "test.xsd", "xmlns": "http://www.w3.org/2001/XMLSchema-instance"}
def is_element(node):
    """Return True when *node* has an ``attrib`` mapping containing the key ``'name'``."""
    if not hasattr(node, 'attrib'):
        return False
    return 'name' in node.attrib
# Placeholder: not implemented yet, so it returns None for every item it is given.
def create_plural(item):
pass
# Parse the *.xml files found in the immediate sub-directories of the current
# working directory and collect, per 'name' attribute, a list of <language>
# elements built from the matching child nodes.
def main():
cwd = pathlib.Path.cwd()
directories = list(filter(lambda path: path.is_dir(), cwd.iterdir()))
# 'langs' is not used in the visible code.
langs = [path.name for path in directories]
# Each element of 'files' is the generator returned by <directory>.glob('*.xml'),
# not a single path.
files = map(operator.methodcaller('glob', '*.xml'), directories)
#trees = dict.fromkeys(unique_names, dict())
for path in files:
# 'path' is one of those generators here, which is what raises
# AttributeError: 'generator' object has no attribute 'open'.
with path.open('r', encoding="utf-8") as file:
tree = etree.parse(file)
root = tree.getroot()
# NOTE(review): 'xml_path' and 'trees' are not defined in the visible code
# (the only assignment to 'trees' is commented out above).
name = xml_path.with_suffix('').with_suffix('').name
out_tree = trees[name]
for child in filter(is_element, root):
# NOTE: 'id' shadows the builtin of the same name inside this function.
id = child.attrib['name']
text = child.text
if id not in out_tree:
out_tree[id] = list()
item = etree.Element('language', attrib={"lang": path.parent.name, "status": "Reviewed"})
if child.tag == "plurals":
item.text = create_plural(child)
else:
item.text = etree.CDATA(text)
out_tree[id].append(item)
if __name__ == '__main__':
main()
#name = '{}.strings.xml'.format(xml_file.with_suffix('').name) # name of the file
#out_p = out_path / lang / name # path of the output file where it should be located
#out_p.parent.resolve().mkdir(parents=True, exist_ok=True) # make directory
#text = etree.tostring(root, xml_declaration=True, pretty_print=True, encoding="utf-8")
#with out_p.open('wb') as file:
# file.write(text) ```
Instead of:
with path.open('r', encoding="utf-8") as file:
tree = etree.parse(file)
You can pass a filename (string) directly to parse:
tree = etree.parse(path)
path in your example is a string so it doesn't have an open function.
Maybe you meant:
with open(path, 'r', encoding="utf-8") as file:
tree = etree.parse(file)
If you trying to find xml file names in the current directory:
[f for f in os.listdir('.') if f.endswith('.xml')]
The issue is this:
files = map(operator.methodcaller('glob', '*.xml'), directories)
glob returns a generator of paths, so files is not a sequence of paths but a sequence of sequences of paths.
You need to either itertools.chain.from_iterable the entire thing into a single sequence, or use a nested loop. Or use a comprehension to unwrap the entire thing straight away. map makes a lot of sense when you already have a function doing what you need, but that's not the case here, so comprehensions tend to be preferable:
files = (
f
for d in directories
for f in d.glob('*.xml')
)

Need to upload sub-dirs and their contents, not just files in current dir

A script was supplied to me in order to upload files to a cloud bucket. You input the dir where the files you want to upload are and bingo bango, done.
What needs to happen is that there are additional sub dirs with their own files in them that I would like to transfer as well based on the input of the root dir. They would need to retain their tree structure relative to the root dir input.
Using the current code I get a write error/access denied fail. I know this is because the for loop is using os.listdir which can't parse the extra sub dirs and files but I'm not sure how to modify.
I attempted to get all the information I needed using os.walk and parsing that out. I verified with some print tests that it was looking in the right place for everything. However I hit a wall when I got this error when running the script:
folder\folder\lib\ntpath.py", line 76, in join
path = os.fspath(path)
TypeError: expected str, bytes or os.PathLike object, not list
I understand that something is being generated as a list when it shouldn't be but I'm not sure how to go about this...
This is the original script provided to me below. I have added the variable at the top just to be a little less abstract.
# Raw string: in a normal string literal every "\f" in this Windows path would
# be read as a form-feed escape instead of a backslash followed by "f".
local_directory_path = r'C:\folder\folder\sync\FROM_LOCAL_UPLOAD'
def upload_folder_to_cloud(self, mount_id, local_directory_path):
    """Upload every entry found directly inside ``local_directory_path``.

    Each name returned by ``os.listdir`` is opened in binary mode and passed to
    ``self.client.upload_file`` with the destination
    ``"<last component of local_directory_path>/<entry name>"``.
    Sub-directories are not descended into; an entry that is a directory makes
    ``open`` raise.

    Args:
        mount_id: Forwarded unchanged to ``self.client.upload_file``.
        local_directory_path: Local directory to upload. It must not end with
            a path separator.

    Returns:
        The destination folder name with a trailing ``"/"``.

    Raises:
        SystemExit: With status 2 when ``local_directory_path`` ends with a
            separator, and with status 4 when ``upload_file`` reports an error.
    """
    _, destination_folder = os.path.split(local_directory_path)
    if not destination_folder:
        self.logger.error("Make sure the provided 'local_directory_path' does not end with a '/' or a '\\'")
        sys.exit(2)

    destination_folder = destination_folder + "/"
    self.logger.info("Folder = {}".format(destination_folder))

    for filename in os.listdir(local_directory_path):
        destination_path = destination_folder + filename
        filepath = os.path.join(local_directory_path, filename)
        with open(filepath, "rb") as f:
            _, err = self.client.upload_file(
                self.group_id,
                mount_id,
                f,
                destination_path=destination_path,
            )
        # Checked after the file is closed; identity comparison is the
        # idiomatic test for None.
        if err is not None:
            self.logger.error(err)
            sys.exit(4)
    return destination_folder
This is what I modified it to as a test:
# The asker's os.walk-based attempt, kept exactly as posted (it does not run).
for root, dirs, files in os.walk(local_directory_path):
# 'files' is a list of file names, so os.path.join(files) raises
# "TypeError: expected str, bytes or os.PathLike object, not list".
srcFile = (os.path.join(files))
srcRoot = (os.path.join(root))
rootSplit = os.path.normpath(srcRoot).split(os.path.sep)
# [4:] hard-codes how many leading path components are dropped.
srcDirs = '/'.join(rootSplit[4:])
# '+ (files)' concatenates a str with a list, which would also raise TypeError.
src = str('fixLocalFolder') + '/' + str(srcDirs) +'/'+ (files)
dst = str(srcDirs) + '/' + (files)
destination_folder = str(srcRoot) + "/"
destination_path = str(destination_folder) + str(srcFile)
# os.path.join receives one tuple here instead of separate string arguments.
filepath = os.path.join((str(srcDirs), str(srcFile)))
with open(filepath, "rb") as f:
_, err = self.client.upload_file(
self.group_id,
mount_id,
f,
destination_path=destination_path,
)
if err != None:
self.logger.error(err)
sys.exit(4)
return destination_folder
I do not code for a living so I am sure I am not going about this the right way. I apologize for any code atrocities in advance. Thank you!
I do see some issues in that code, even without testing it. Something like the following might work for that loop. (Note! Untested!).
# Replacement for the os.listdir loop of upload_folder_to_cloud: walk the whole
# tree below local_directory_path and upload every file, keeping its position
# relative to that folder. destination_folder is the "<folder name>/" prefix
# the method builds before the loop.
for root, dirs, files in os.walk(local_directory_path):
    # Directory currently being processed, relative to the folder being
    # uploaded ("." for the top-level folder itself). It must be taken against
    # the local root: destination_folder is a remote name, not a local path.
    sub_path = os.path.relpath(root, start=local_directory_path)
    # Iterate through files in the currently processed directory
    for current_file in files:
        # Full path to file
        src_file = os.path.join(root, current_file)
        # normpath drops the leading "./" for files of the top-level folder.
        relative_file = os.path.normpath(os.path.join(sub_path, current_file))
        # Destination keeps the tree structure below destination_folder and
        # uses "/" separators, like the single-level upload did.
        destination_path = destination_folder + relative_file.replace(os.sep, "/")
        with open(src_file, "rb") as f:
            _, err = self.client.upload_file(
                self.group_id,
                mount_id,
                f,
                destination_path=destination_path,
            )
        if err is not None:
            self.logger.error(err)
            sys.exit(4)
I believe your central problem was misunderstanding what os.walk gives you. It gives you a listing of each directory (and subdirectory), one after another.
Thus the values of each iteration might look like this (when listing /mydir):
# First iteration:
root = "/mydir"
dirs = ["subdir", ...]
files = ["something.doc", "something else.txt"]
# Second iteration:
root = "/mydir/subdir"
dirs = ["sub-sub-dir1", ...]
files = ["file1.txt", "file2.txt", ...]

Delete Zipped Folder in Python

I'm new to Python and trying to learn. I have a Python script that unzips compressed (zipped) folders from a source folder and extracts everything to a destination folder. In addition, I want to delete the source contents once they have been extracted. How would I achieve this? Thanks for your help in advance!
Basically, inside the path "L:\Python\Source Zipped files" I have multiple zipped folders. My script unzips each folder and extracts it to the final destination. I'm looking for an approach where, once the first folder has been unzipped and extracted, it is deleted from the source folder, and likewise for the following ones. I have included a snippet of how the source folder looks.
enter image description here
Here is my query
import os
import zipfile
import shutil
import json
data_dir = r'L:\Python\Source Zipped files'
temp_dir = r'L:\Python\temp1'
new_dir = r'L:\Python\temp2'
final_dir = r'L:\Python\Destination Unzipped files'
# For every archive in data_dir: extract it into a fresh temp_dir, extract each
# inner archive into new_dir, and copy GENERIC_ROUGHDRAFT.xml / XACTDOC.xml to
# final_dir under a numbered name.
# NOTE(review): 'logging', 'time' and 'my_time' are used below but are not
# imported or defined in the visible snippet. Indentation was lost, so the exact
# nesting of the statements cannot be confirmed from the text.
big_list = os.listdir(data_dir)
archive_count = 0
# file_count is the running number placed in the output file names.
file_count = 152865
basename1 = os.path.join(final_dir,'GENERIC_ROUGHDRAFT')
basename2 = os.path.join(final_dir,'XACTDOC')
my_time()
archive_count = len(big_list)
logging.info('Unzipping {} archives...'.format(archive_count))
for folder in big_list:
prior_count = file_count
logging.info('Starting: {}'.format(folder))
# Start every archive with an empty temp_dir.
try:
shutil.rmtree(temp_dir)
except FileNotFoundError:
pass
os.mkdir(temp_dir)
# The source archive os.path.join(data_dir, folder) is only read here; nothing
# in this script removes it afterwards.
with zipfile.ZipFile(os.path.join(data_dir,folder),mode='r') as a_zip:
a_zip.extractall(path = temp_dir)
# NOTE(review): archive_count was already set to len(big_list) above, so the
# "cumulative total" logged below starts from that number, not from 0 -- confirm intent.
archive_count += 1
logging.info('Cumulative total of {} archive(s) unzipped'.format(archive_count))
bigger_list = os.listdir(temp_dir)
logging.info('Current archive contains {} subfolders'.format(len(bigger_list)))
for sub_folder in bigger_list:
with zipfile.ZipFile(os.path.join(temp_dir,sub_folder),mode='r') as b_zip:
b_zip.extractall(path = new_dir)
file1 = "%s (%d).%s" % (basename1, file_count, 'xml')
file2 = "%s (%d).%s" % (basename2, file_count, 'xml')
shutil.copy(os.path.join(new_dir, 'GENERIC_ROUGHDRAFT.xml'), file1)
shutil.copy(os.path.join(new_dir, 'XACTDOC.xml'), file2)
file_count += 1
logging.info('{} subfolders unzipped'.format(file_count - prior_count))
my_time()
# The totals below are computed from the absolute counter, which includes its
# starting value of 152865.
logging.info('Total of {0} files -- {1} pairs -- should be in {2}'.format(2*(file_count-1), file_count-1, final_dir))
time.sleep(1)
my_time()

Getting paths of each file of a directory into an Array in python

I'm trying to collect into an array files[] the path of every file in the Data folder, including the files in subfolders at any depth. At the moment I can read the files in Data and in its direct subfolders, but my code does not go any deeper: it does not reach the subfolders of the subfolders of Data without writing yet another loop. What I want is a loop with unlimited depth, so that I get the paths of all the files under the Data folder.
For example this is what I get:
['Data/DataReader.py', 'Data/DataReader - Copy.py', 'Data/Dat/DataReader.py', 'Data/fge/er.txt']
This is what I want but it can still go into deeper folders:
['Data/DataReader.py', 'Data/DataReader - Copy.py', 'Data/Dat/DataReader.py', 'Data/fge/er.txt', 'Data/fge/Folder/dummy.png', 'Data/fge/Folder/AnotherFolder/data.dat']
This is my current path, what would i need to add or change?
import os
from os import walk
# The asker's attempt (Python 2: it uses the print statement). It collects the
# files of 'Data' and of its direct sub-folders only.
# NOTE(review): indentation was lost, so the nesting of the last loop is a guess.
files = []
folders = []
# walk yields (current path, sub-directory names, file names); the names used
# here are shifted: 'dirpath' receives the sub-directory names and 'filename'
# the list of file names.
for (dirname, dirpath, filename) in walk('Data'):
folders.extend(dirpath)
files.extend(filename)
# Stops after the first directory, so only the top level of 'Data' is read.
break
# Prefix every collected name with 'Data/'.
filecount = 0
for i in files:
i = 'Data/' + i
files[filecount] = i
filecount += 1
foldercount = 0
for i in folders:
i = 'Data/' + i
folders[foldercount] = i
foldercount += 1
subfolders = []
subf_files = []
for i in folders:
for (dirname, dirpath, filename) in walk(i):
subfolders.extend(dirpath)
subf_files.extend(filename)
# Again only the first directory of each walk is read, so nothing below
# Data/<sub-folder>/ is ever collected.
break
subf_files_count = 0
for a in subf_files:
a = i + '/'+a
# 'files = files' has no effect.
files = files
files.append(a)
print files
subf_files = []
print files
print folders
Thanks a lot!
I don't understand what you are trying to do, especially why you break out of your walk after the first element:
import os
files = []
folders = []
# os.walk visits 'Data' and every directory below it, at any depth; 'path' is
# the directory currently being visited.
for (path, dirnames, filenames) in os.walk('Data'):
    folders.extend(os.path.join(path, name) for name in dirnames)
    files.extend(os.path.join(path, name) for name in filenames)
# print(x) with a single argument prints the same on Python 2 and Python 3,
# whereas the bare 'print x' statement is a SyntaxError on Python 3.
print(files)
print(folders)

Categories

Resources