I need to remove the sheets whose names are in an array. Unfortunately, neither of these: tempWb.remove(wsToRemoveNameArray[wsToRemoveIndex]) , nor this:
del tempWb[wsToRemoveNameArray[wsToRemoveIndex]] works with my code:
Does anyone know how to deal with it?
def splitExcelFiles(InputPath, OutputPath, fileNameArray):
    """Split each workbook into one output file per worksheet.

    For every name in fileNameArray, open InputPath\\<name> and write one
    .xlsx per worksheet into OutputPath, each named after its sheet.

    BUG FIXES vs. the original:
    - openpyxl's Workbook.remove() expects a Worksheet object, not a name
      string, so removals now go through temp_wb[ws_name].
    - Sheets were removed from the one in-memory workbook across passes,
      destroying sheets needed later; the workbook is reloaded per sheet.
    - The single-sheet case saved under str(wb.sheetnames), i.e. the list
      repr "['Sheet1']"; it now uses the sheet name itself.
    """
    for file_name in fileNameArray:
        temp_file = InputPath + '\\' + file_name
        wb = load_workbook(temp_file)
        sheet_names = wb.sheetnames
        if len(sheet_names) == 1:
            # Only one sheet: the workbook can be saved as-is.
            wb.save(str(OutputPath) + '\\' + sheet_names[0] + '.xlsx')
        else:
            for keep_name in sheet_names:
                # Fresh copy of the workbook for every sheet we keep, so
                # earlier removals never affect later iterations.
                temp_wb = load_workbook(temp_file)
                for ws_name in temp_wb.sheetnames:
                    if ws_name != keep_name:
                        temp_wb.remove(temp_wb[ws_name])  # pass the Worksheet object
                temp_wb.save(str(OutputPath) + '\\' + keep_name + '.xlsx')
First things first, some general tips:
Use the pathlib library whenever you deal with Paths. It makes things a lot easier. There is never a good reason to include the path delimiter in your code.
In python, it's common to write variables and functions with an underscore: save_path instead of savePath
Now that we have that out of the way, here is a tested example. Just change the directories to match yours.
from openpyxl import load_workbook, Workbook
from pathlib import Path
import shutil
def make_absolute_path(path, name, suffix=".xlsx"):
    """Join *name* onto *path* and force the file extension to *suffix*."""
    return (path / name).with_suffix(suffix)
def remove_all_sheets_except_filename(input_path, output_path, filenames):
    """Copy each workbook into output_path keeping only the sheet whose
    name matches the file's stem; return the list of written file paths.

    :param input_path: Path to the directory holding the source .xlsx files.
    :param output_path: Path to the directory receiving the results.
    :param filenames: Iterable of file names (without directory).
    :raises ValueError: if, after removal, more than one sheet remains.
    """
    output_files_path = []
    for filename in filenames:
        input_file_path = make_absolute_path(input_path, filename)
        output_file_path = make_absolute_path(output_path, filename)
        if not input_file_path.is_file():
            print(f"Skipping {input_file_path}: Not valid file " f"path. ")
            continue
        # Work on a copy so the source workbook is never modified.
        shutil.copyfile(input_file_path, output_file_path)
        wb_source: Workbook = load_workbook(filename=output_file_path)
        if len(wb_source.worksheets) == 1:
            save_path = make_absolute_path(output_path, str(wb_source.sheetnames[0]))
            wb_source.save(save_path)
            output_files_path.append(str(save_path))
        else:
            for sheet in wb_source.sheetnames:
                if sheet != input_file_path.stem:
                    wb_source.remove(wb_source[sheet])
            if len(wb_source.worksheets) == 1:
                save_path = make_absolute_path(
                    output_path, str(wb_source.sheetnames[0])
                )
                wb_source.save(save_path)
                output_files_path.append(str(save_path))
            else:
                # BUG FIX: join the sheet *names*; joining Worksheet
                # objects raised TypeError before the message ever printed.
                print(
                    f"Failed to process {input_file_path} with following "
                    f"sheets: {','.join(wb_source.sheetnames)}."
                )
                # BUG FIX: the original raised ValueError("") with no message.
                raise ValueError(
                    f"No single sheet named {input_file_path.stem!r} "
                    f"left in {input_file_path}"
                )
    return output_files_path
def main():
    """Demo driver: adjust the two directories to your environment."""
    source_dir = Path("path/to/your/xlsx/files")
    target_dir = Path("path/to/the/output/directory")
    names = ["input", "foo", "bar"]
    paths = remove_all_sheets_except_filename(source_dir, target_dir, names)


if __name__ == "__main__":
    main()
``
Does Python have any built-in functionality to add a number to a filename if it already exists?
My idea is that it would work the way certain OS's work - if a file is output to a directory where a file of that name already exists, it would append a number or increment it.
I.e: if "file.pdf" exists it will create "file2.pdf", and next time "file3.pdf".
I ended up writing my own simple function for this. Primitive, but gets the job done:
def uniquify(path):
    """Return *path* unchanged if free, otherwise the first
    '<name> (n)<ext>' variant (n = 1, 2, ...) that does not exist."""
    stem, ext = os.path.splitext(path)
    n = 1
    while os.path.exists(path):
        path = stem + " (" + str(n) + ")" + ext
        n += 1
    return path
In a way, Python has this functionality built into the tempfile module. Unfortunately, you have to tap into a private global variable, tempfile._name_sequence. This means that officially, tempfile makes no guarantee that in future versions _name_sequence even exists -- it is an implementation detail.
But if you are okay with using it anyway, this shows how you can create uniquely named files of the form file#.pdf in a specified directory such as /tmp:
import tempfile
import itertools as IT
import os
def uniquify(path, sep = ''):
    """Create and return a uniquely named file based on *path*.

    Temporarily replaces tempfile's private name generator so that
    tempfile.mkstemp() produces names of the form <stem><sep><n><ext>
    (bare name first, then 0, 1, 2, ...) instead of random names.
    mkstemp creates the file atomically, so the returned name is
    guaranteed not to clash.

    NOTE(review): relies on the private tempfile._name_sequence /
    tempfile._once_lock internals -- no API guarantee they exist in
    future Python versions.

    :param path: desired file path, e.g. '/tmp/file.pdf'
    :param sep: separator inserted between the stem and the number
    :return: the path of the newly created file (the open fd is leaked --
             the caller only gets the name back)
    """
    def name_sequence():
        # First candidate: empty suffix (the bare name), then 0, 1, 2, ...
        count = IT.count()
        yield ''
        while True:
            yield '{s}{n:d}'.format(s = sep, n = next(count))
    orig = tempfile._name_sequence
    with tempfile._once_lock:
        # Swap in our deterministic sequence, let mkstemp pick the first
        # free name, then restore the original generator.
        tempfile._name_sequence = name_sequence()
        path = os.path.normpath(path)
        dirname, basename = os.path.split(path)
        filename, ext = os.path.splitext(basename)
        fd, filename = tempfile.mkstemp(dir = dirname, prefix = filename, suffix = ext)
        tempfile._name_sequence = orig
    return filename
print(uniquify('/tmp/file.pdf'))
I was trying to implement the same thing in my project but #unutbu's answer seemed too 'heavy' for my needs so I came up with following code finally:
import os
# Create '../hi', or '../hi(1)', '../hi(2)', ... -- the first free name.
index = ''
while True:
    try:
        os.makedirs('../hi'+index)
        break
    except FileExistsError:
        # BUG FIX: the original caught WindowsError, which only exists on
        # Windows (NameError elsewhere); FileExistsError (Python 3.3+) is
        # the cross-platform exception makedirs raises for existing dirs.
        if index:
            index = '('+str(int(index[1:-1])+1)+')' # Append 1 to number in brackets
        else:
            index = '(1)'
        # Go and try to create the directory again
Just in case someone stumbled upon this and requires something simpler.
If all files being numbered isn't a problem, and you know beforehand the name of the file to be written, you could simply do:
import os
# Probe file0.pdf, file1.pdf, ... until a free name is found.
counter = 0
filename = "file{}.pdf"
while True:
    if not os.path.isfile(filename.format(counter)):
        break
    counter += 1
filename = filename.format(counter)
recently I encountered the same thing and here is my approach:
import os
# If file_name.txt is taken, fall back to file_name2.txt, file_name3.txt, ...
file_name = "file_name.txt"
if os.path.isfile(file_name):
    expand = 1
    while True:
        expand += 1
        new_file_name = file_name.split(".txt")[0] + str(expand) + ".txt"
        if not os.path.isfile(new_file_name):
            file_name = new_file_name
            break
Let's say you already have those files:
This function generates the next available non-already-existing filename, by adding a _1, _2, _3, ... suffix before the extension if necessary:
import os
def nextnonexistent(f):
    """Return *f* if it does not exist, otherwise the first free
    '<root>_1<ext>', '<root>_2<ext>', ... variant."""
    root, ext = os.path.splitext(f)
    candidate = f
    i = 0
    while os.path.exists(candidate):
        i += 1
        candidate = '%s_%i%s' % (root, i, ext)
    return candidate
# Example: with foo.txt, foo_1.txt and foo_2.txt already on disk, plus
# bar.txt, the calls below produce the results in the trailing comments.
print(nextnonexistent('foo.txt')) # foo_3.txt
print(nextnonexistent('bar.txt')) # bar_1.txt
print(nextnonexistent('baz.txt')) # baz.txt
Since the tempfile hack A) is a hack and B) still requires a decent amount of code anyway, I went with a manual implementation. You basically need:
A way to Safely create a file if and only if it does not exist (this is what the tempfile hack affords us).
A generator for filenames.
A wrapping function to hide the mess.
I defined a safe_open that can be used just like open:
def iter_incrementing_file_names(path):
    """
    Yield path itself first, then path with " (n)" inserted before the
    extension for n = 1, 2, 3, ... without end.

    :param path: Some path
    :return: An iterator.
    """
    yield path
    root, extension = os.path.splitext(path)
    n = 1
    while True:
        yield '{0} ({1}){2}'.format(root, n, extension)
        n += 1
def safe_open(path, mode):
    """
    Open path, but if it already exists, add " (n)" before the extension,
    where n is the first number found such that the file does not already
    exist.
    Returns an open file handle. Make sure to close!
    :param path: Some file name.
    :return: Open file handle... be sure to close!
    """
    # O_EXCL makes creation atomic: if the name got taken between the
    # check and the open, the OS reports EEXIST and we try the next one.
    flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY
    if 'b' in mode and platform.system() == 'Windows':
        flags |= os.O_BINARY
    for filename in iter_incrementing_file_names(path):
        try:
            fd = os.open(filename, flags)
        except OSError as e:
            # Name taken -- try the next candidate; anything else is fatal.
            if e.errno != errno.EEXIST:
                raise
        else:
            return os.fdopen(fd, mode)


# Example
with safe_open("some_file.txt", "w") as fh:
    print("Hello", file=fh)
I haven't tested this yet but it should work, iterating over possible filenames until the file in question does not exist at which point it breaks.
def increment_filename(fn):
    """Yield fn, then fn with 1, 2, 3, ... appended before the extension.

    BUG FIXES: the original split `path` (an undefined name) instead of
    `fn`, was missing the colon on the `for` line, and formatted the name
    as '%s%d.%s' although os.path.splitext keeps the dot in the extension.
    """
    fn, extension = os.path.splitext(fn)
    yield fn + extension
    for n in itertools.count(start=1, step=1):
        yield '%s%d%s' % (fn, n, extension)
# Take the first candidate name that is not already a file on disk.
# NOTE(review): `original_filename` must be defined by the surrounding
# context before this loop runs.
for filename in increment_filename(original_filename):
    if not os.path.isfile(filename):  # BUG FIX: os.isfile does not exist
        break
This works for me.
The initial file name is 0.yml; if it already exists, the number is incremented by one until a free name is found.
import os
import itertools
def increment_filename(file_name):
    """Yield file_name, then names whose (numeric) stem is incremented
    by 1, 2, 3, ... -- e.g. '0.yml', '1.yml', '2.yml', ..."""
    stem, extension = os.path.splitext(file_name)
    yield stem + extension
    n = 1
    while True:
        yield "%s%s" % (int(stem) + n, extension)
        n += 1
def get_file_path():
    """Return the first '/tmp/<n>.yml' path that is not an existing file."""
    for candidate in increment_filename("0.yml"):
        full_path = os.path.join('/tmp', candidate)
        if not os.path.isfile(full_path):
            return full_path
    return None
import os
class Renamer():
    """Produce a non-clashing variant of a file name.

    Splits *name* at the last dot into self.name / self.extension and,
    on rename(), appends " (i)" until the candidate does not exist.
    NOTE(review): assumes *name* contains a dot -- an extension-less name
    would be mangled by the split; confirm callers always pass one.
    """

    def __init__(self, name):
        self.extension = name.split('.')[-1]
        self.name = name[:-len(self.extension)-1]
        self.filename = self.name

    def rename(self):
        """Return the original name, or the first '<name> (i).<ext>' free on disk."""
        i = 1
        # The while condition alone covers the original's redundant outer
        # `if os.path.exists(...)` guard, which tested the same thing.
        while os.path.exists(self.filename+'.'+self.extension):
            self.filename = '{} ({})'.format(self.name, i)
            i += 1
        return self.filename+'.'+self.extension
I found that the os.path.exists() conditional function did what I needed. I'm using a dictionary-to-csv saving as an example, but the same logic could work for any file type:
import os
def smart_save(filename, dict):
    """Write *dict* as CSV to '<filename>_<i>.csv' for the first free i in 0..499.

    BUG FIXES: the original wrote `dictionary[key]` (an undefined name)
    instead of `dict[key]`, and used `np.arange(0, 500, 1)` although numpy
    was never imported -- plain range(500) is equivalent for integer steps.
    NOTE(review): the parameter name `dict` shadows the builtin; kept
    unchanged for backward compatibility with existing callers.
    """
    od = filename + '_' # added underscore before number for clarity
    for i in range(500): # arbitrary upper limit of 500
        d = od + str(i)
        if not os.path.exists(d + '.csv'):
            with open(d + '.csv', 'w') as f: #or any saving operation you need
                for key in dict.keys():
                    f.write("%s,%s\n" % (key, dict[key]))
            break
Note: this appends a number (starting at 0) to the file name by default, but it's easy to shift that around.
This function validates if the file name exists using regex expresion and recursion
def validate_outfile_name(input_path):
    """Return a non-clashing variant of *input_path* using a '(n)' suffix.

    If input_path exists, bump an existing single-digit "(n)" marker in
    the stem (or append "(1)") and recurse until the name is free;
    otherwise return input_path unchanged.
    """
    filename, extension = os.path.splitext(input_path)
    if not os.path.exists(input_path):
        return input_path
    pattern = r'\([0-9]\)'  # NOTE(review): only matches a single digit
    match = re.search(pattern, filename)
    if match:
        version = filename[match.start() + 1]
        try:
            new_version = int(version) + 1
        except ValueError:
            new_version = 1
        output_path = f"{filename[:match.start()]}({new_version}){extension}"
    else:
        # BUG FIX: the original emitted a literal "(unknown)" here instead
        # of the real file stem, discarding the file name entirely.
        output_path = f"{filename}(1){extension}"
    # Recurse in both branches so the returned name is guaranteed free.
    return validate_outfile_name(output_path)
I've implemented a similar solution with pathlib:
Create file-names that match the pattern path/<file-name>-\d\d.ext. Perhaps this solution can help...
import pathlib
from toolz import itertoolz as itz
def file_exists_add_number(path_file_name, digits=2):
    """Return path_file_name with the next free zero-padded '-NN' suffix.

    Scans the parent directory for '<stem>-<NN><ext>' siblings and picks
    the highest existing number plus one (starting at 01).

    BUG FIXES: the original formatted an undefined name `last_no` instead
    of `curr_no`, and relied on third-party toolz plus glob's arbitrary
    iteration order; sorting the matches makes "last" deterministic and
    drops the dependency.
    """
    pfn = pathlib.Path(path_file_name)
    parent = pfn.parent  # parent-dir of file
    stem = pfn.stem      # file-name w/o extension
    suffix = pfn.suffix  # NOTE: extension starts with '.' (dot)!
    # Files ending with '-\d\d.ext' (one '?' wildcard per digit).
    matches = sorted(parent.glob(f"{stem}-{digits * '?'}{suffix}"))
    if matches:
        try:
            curr_no = int(matches[-1].stem[-digits:]) + 1
        except ValueError:
            # Suffix was not numeric -- start the sequence over.
            curr_no = 1
    else:
        curr_no = 1
    # int to string with leading zeros
    counter = str(curr_no).zfill(digits)
    return str(parent / f"{stem}-{counter}{suffix}")
Pls note: That solution starts at 01 and will only find file-pattern containing -\d\d!
def create_file():
    """Print (and return) the first free 'dir/file<counter>.txt' name.

    BUG FIX: the original interpolated the literal placeholder "(unknown)"
    while the `filename` variable was assigned but never used.
    """
    counter = 0
    filename = "file"
    while os.path.isfile(f"dir/{filename}{counter}.txt"):
        counter += 1
    name = f"{filename}{counter}.txt"
    print(name)
    # Returning the name is new but backward-compatible (callers ignored None).
    return name
A little bit late, but here is something that should still work properly; maybe it will be useful for someone.
You can use built-in iterator to do this ( image downloader as example for you ):
def image_downloader():
    """Download the same URL ten times as image_0.jpg ... image_9.jpg."""
    image_url = 'some_image_url'
    for count in range(10):
        payload = requests.get(image_url).content
        with open(f'image_{count}.jpg', 'wb') as handler:
            handler.write(payload)
Files will be numbered properly. The result is:
image_0.jpg
image_1.jpg
image_2.jpg
image_3.jpg
image_4.jpg
image_5.jpg
image_6.jpg
image_7.jpg
image_8.jpg
image_9.jpg
An easy way to create a new file if a file with this name already exists in your folder:
# Save wb as testdir/sample.xlsx, or as 'sample (i).xlsx' if that is taken.
if 'sample.xlsx' not in os.listdir('testdir/'):
    wb.save(filename=f"testdir/sample.xlsx")
else:
    i = 2
    while os.path.exists(f'testdir/sample ({i}).xlsx'):
        i += 1
    wb.save(filename=f"testdir/sample ({i}).xlsx")
Below is the data in CFS_Config.txt. What this textfile does is to know where the documents have stored and to avoid hardcodes in the program. For instance, if the program is moved to other environment, we only need to change the directory paths in the CFS_Config.txt file.
Folder Path = ../dataprep/source_documents
ED Notes name = ED Notes
ED Notes output = ../dataprep/ED_Notes
This below codes in a python file what it actually does is to read configuration from the CFS_Config.txt mentioned earlier and also to do an auto generated textfile.
The problem encountered is that the program reports that the ../dataprep/ED_Notes path was not found. Please take a look at the code; if more code is needed I will do my best to provide it, thanks! :((
from preprocessing import ednotes_extractor
import os
def read_config():
    """Read CFS_Config.txt and return (root_dir, ednotes_name, ednotes_output).

    Each value is the text after its 'Key = ' prefix with the trailing
    newline removed; ednotes_output additionally gets '.txt' appended.
    Expects the three keys on lines 1-3 in this exact order.
    """
    file_list = []
    root_dir = ""
    ednotes_name = ""
    ednotes_output = ""
    # BUG FIX (resource leak): the original opened the file without ever
    # closing it; the with-statement guarantees the handle is released.
    with open("..\CFS_Config.txt", "r") as cfs_config_txt:
        for line in cfs_config_txt:
            file_list.append(line)
    if "Folder Path = " in file_list[0]:
        root_dir = file_list[0].replace("Folder Path = ", "").replace("\n", "")
    if "ED Notes name = " in file_list[1]:
        ednotes_name = file_list[1].replace("ED Notes name = ", "").replace("\n", "")
    if "ED Notes output = " in file_list[2]:
        ednotes_output = file_list[2].replace("ED Notes output = ", "")
        ednotes_output = ednotes_output + ".txt"
        ednotes_output = ednotes_output.replace("\n", "")
    return root_dir, ednotes_name, ednotes_output
def convert_txt(choices):
    """Export ED Notes data to the configured output text file.

    choices == 1: write a 'cat_id|content' header, then append the rows
    extracted from every document under the configured root directory.
    choices == 2: placeholder branch (prints "Hi").
    """
    root_dir, ednotes_name, ednotes_output = read_config()
    if choices == 1:
        # BUG FIX (resource leak): the original opened the output once in
        # 'w' mode and then again in 'a' mode on every loop iteration,
        # never closing any handle.  One managed handle does both jobs.
        with open(ednotes_output, 'w', encoding='utf-8') as text_file:
            text_file.write("cat_id|content\n")
            for filename in os.listdir(root_dir):
                source_directory = root_dir + '/' + filename
                arr = ednotes_extractor.get_ednotes(source_directory)
                for item in arr:
                    text_file.write("%s\n" % item)
    elif choices == 2:
        print("Hi")
I am trying to merge 1000+ pdf pages, and it works with under 750 pages. If I open more than 750 it processes it, but output file is 0 bytes.
from PyPDF3 import PdfFileWriter, PdfFileReader, PdfFileMerger
import os
import sys
from collections import OrderedDict
import win32file
# Raise the C runtime's limit on simultaneously open files: the merge
# below keeps every source PDF's handle open until the final write.
win32file._setmaxstdio(8192)
print(win32file._getmaxstdio())
# PyPDF recurses through nested PDF objects while writing; large merged
# documents can exceed the default limit of 1000.
sys.setrecursionlimit(30000)
# Default output name: the name of the current working directory.
nameOfFile = os.path.basename(os.getcwd())
#get page number
def getPageNr(arg1):
    """Extract the trailing page number from a file name shaped like
    'Main - Sub - Page 12.pdf'."""
    text = str(arg1).replace('.pdf', '')
    parts = text.split(' - ')
    # Last segment carries the page; strip hyphens and the 'Page ' label.
    last = parts[-1].replace('-', '').replace('Page ', '')
    return int(last)
currentFolder = os.getcwd()
# Collect the bare names of every PDF anywhere under the current folder.
# NOTE(review): os.path.join(name) keeps only the file name, so equally
# named files in different subfolders collide -- confirm names are unique.
pdffiles = [os.path.join(name)
            for root, dirs, files in os.walk(currentFolder)
            for name in files
            if name.endswith((".pdf"))]
#create dictionary and get whole list
di={}
#direct copy and create key from page number on back and value is original list
for string in pdffiles:
    di.setdefault(getPageNr(string),str(string))
#sort it by keys
di2 = OrderedDict(sorted(di.items()))
# Rebuild pdffiles sorted by page number.
pdffiles.clear()
for key,values in di2.items():
    pdffiles.append(values)
#put a correction
# Skip the first `adder` pages (presumably merged in an earlier run --
# TODO confirm).
pageAt = 0
adder = 421
pageAt = pageAt + adder
#add global variables for page in bookmark
mainTitlePage = 0
secondTitlePage = 0
thirdTitlePage = 0
#define globals for bookmarks
mainTitle = ''
SecondTitle = ''
thirdTitle = ''
#define previous bookmarks
lastMainTitle = ''
lastSecondTitle = ''
lastThirdTitle = ''
#if main title is same as next page
isSame = True
#start Merger
editer = PdfFileMerger()
#start main loop
# Merge pages until 2000 pages are added, the list is exhausted, or the
# main title changes (isSame flips False one page after a title change).
while pageAt<(adder+2000) and pageAt<len(pdffiles) and isSame:
    #break filename to titles
    titles = pdffiles[pageAt].split(' - ')
    #break next page for titles
    # NOTE(review): pdffiles[pageAt+1] raises IndexError on the very last
    # entry of the list -- confirm the range check always stops earlier.
    titlesNext = pdffiles[pageAt+1].split(' - ')
    #get titles
    mainTitle = titles[0]
    secondTitle = titles[1]
    if not titlesNext[0] == mainTitle:
        isSame = False
    hasThird = False
    # A file name with more than four ' - ' segments carries a third title level.
    if len(titles)>4:
        thirdTitle = titles[2]
        hasThird = True
    else:
        thirdTitle = None
        hasThird = False
    #open individual page
    # NOTE(review): each source handle stays open until after the final
    # write (PyPDF reads lazily) -- this is why maxstdio was raised above;
    # only the last kStream is explicitly closed at the end.
    kStream = open(pdffiles[pageAt], 'rb')
    inputK = PdfFileReader(kStream)
    #test if titles are changing
    if not mainTitle == lastMainTitle:
        KmainParent = editer.addBookmark(mainTitle, 0)
    if not secondTitle == lastSecondTitle:
        secondTitlePage = pageAt-adder
        #print(secondTitle)
        Kparent = editer.addBookmark(secondTitle, secondTitlePage, KmainParent)
    if hasThird:
        if not thirdTitle == lastThirdTitle:
            thirdTitlePage = pageAt-adder
            Mparent = editer.addBookmark(thirdTitle, thirdTitlePage, Kparent)
        editer.addBookmark(titles[3], pageAt-adder, Mparent)
    else:
        editer.addBookmark(titles[2], pageAt-adder, Kparent)
    #merge page with fixed bookmarks
    editer.merge((pageAt - adder), inputK)
    #get titles and save them for future
    lastMainTitle = mainTitle
    lastSecondTitle = secondTitle
    lastThirdTitle = thirdTitle
    #go to next page
    pageAt += 1
#get name for output file
nameOfFile = mainTitle + '.pdf'
print('Saving ' + nameOfFile)
#start new file and export it
outR = open(nameOfFile, 'wb')
editer.write(outR)
outR.close()
kStream.close()
Now it puts all bookmarks, no problem there. But how to process more than 750 pages.
I have increased recursion limit and maxstdio...but if there are 1000 or more pages, merged file is 0 bytes, but process takes minute or two, so it is processing.
I do not get any of errors.
Can anybody help me process more than 750 pages?
I'm trying to replicate the exporting of a Code Module from an Excel sheet in Python.
The following works in VBA:
' Export every standard code module (VBComp.Type = 1) of this workbook
' to a .txt file in the workbook's own folder, named after the module.
' Requires "Trust access to the VBA project object model" to be enabled.
Public Sub ExportModules()
    Dim wb As Workbook
    Set wb = ThisWorkbook
    Dim D As String
    Dim N
    D = ThisWorkbook.Path
    For Each VBComp In wb.VBProject.VBComponents
        If (VBComp.Type = 1) Then
            ' Build <workbook folder>\<module name>.txt and export to it.
            N = D + "\" + VBComp.Name + ".txt"
            VBComp.Export N
        End If
    Next
End Sub
And I have the following in Python:
import os
import sys
import glob
from win32com.client import Dispatch
scripts_dir = 'folder address'
# Drive Excel through COM; keep it invisible and suppress dialog prompts.
com_instance = Dispatch("Excel.Application")
com_instance.Visible = False
com_instance.DisplayAlerts = False
for script_file in glob.glob(os.path.join(scripts_dir, "*.xlsm")):
    # BUG FIX: `print "..."` is Python 2 syntax and a SyntaxError on
    # Python 3, which the rest of this file (f-strings) targets.
    print("Processing: %s" % script_file)
    (file_path, file_name) = os.path.split(script_file)
    objworkbook = com_instance.Workbooks.Open(script_file)
    for xlmodule in objworkbook.VBProject.VBComponents:
        # NOTE(review): Export needs a real per-module path (e.g. built
        # from file_name and xlmodule.Name); the placeholder string
        # mirrors the question as asked.
        xlmodule.Export('export file name')
My question is, what do I have to do in Python to replicate the Export of the file as per the VBA code?
oletools (as shown in the xltrail blog post below) provides a good way to extract .bas files from .xlsm and other Excel files:
import os
import shutil
from oletools.olevba3 import VBA_Parser
EXCEL_FILE_EXTENSIONS = ('xlsb', 'xls', 'xlsm', 'xla', 'xlt', 'xlam',)
def parse(workbook_path):
    """Extract all VBA modules of workbook_path into '<path>.vba/<Name>.bas'.

    Attribute VB_* lines and a trailing blank line are stripped from each
    module; modules with no non-empty code lines are skipped entirely.
    """
    vba_path = workbook_path + '.vba'
    vba_parser = VBA_Parser(workbook_path)
    if vba_parser.detect_vba_macros():
        vba_modules = vba_parser.extract_all_macros()
    else:
        vba_modules = []
    for _, _, _, content in vba_modules:
        decoded_content = content.decode('latin-1')
        # Normalise to a list of lines regardless of the EOL convention.
        if '\r\n' in decoded_content:
            lines = decoded_content.split('\r\n')
        else:
            lines = decoded_content.split('\n')
        if not lines:
            continue
        # First line carries the module name: Attribute VB_Name = "Foo"
        name = lines[0].replace('Attribute VB_Name = ', '').strip('"')
        body = [line for line in lines[1:]
                if not (line.startswith('Attribute') and 'VB_' in line)]
        if body and body[-1] == '':
            body.pop()
        if len([c for c in body if c]) > 0:
            if not os.path.exists(os.path.join(vba_path)):
                os.makedirs(vba_path)
            with open(os.path.join(vba_path, name + '.bas'), 'w') as f:
                f.write('\n'.join(body))
if __name__ == '__main__':
    for root, dirs, files in os.walk('.'):
        # Remove previously extracted .vba directories first...
        for entry in dirs:
            if entry.endswith('.vba'):
                shutil.rmtree(os.path.join(root, entry))
        # ...then re-extract from every Excel file found.
        for entry in files:
            if entry.endswith(EXCEL_FILE_EXTENSIONS):
                parse(os.path.join(root, entry))
I have tried it and it works great.
Ref: https://www.xltrail.com/blog/auto-export-vba-commit-hook