I'm working on a program to split Excel files into sections of 1000 rows. I can't get it to produce the second output file: xlsxwriter never creates it.
from os.path import join, dirname, abspath
from xlrd.sheet import ctype_text
import csv
import os
import sys
import xlrd
import xlsxwriter
import xlwt
# Script body: splits the first column of a spreadsheet into numbered
# "<name>_<n>.xlsx" files.
# NOTE(review): indentation and some lines (e.g. an `else:` after the prompt,
# the cell writes of the CSV conversion) appear to be missing from this copy.
# Command-line arguments are glued into a single path string; presumably this
# supports dropping a file onto the script.
file_paths = sys.argv[1:]
draganddrop = ''.join(file_paths)
beginGrab = 0
counting = 0
endGrab = 1000
thousands = 0
# No argument given: ask for the file name (raw_input => Python 2).
if draganddrop == "":
fileName = raw_input("\nInput the file with extension\n>")
fileName = draganddrop
# Split the name at its first '.' into extension (prepRev) and stem (preName).
stopPoint = fileName.index('.')
prepRev = fileName[stopPoint:]
preName = fileName[:stopPoint]
# CSV input is first converted into a temporary "<stem>.xlsx" workbook.
if prepRev == ".csv":
excelFile = xlsxwriter.Workbook(preName + '.xlsx')
worksheet = excelFile.add_worksheet()
with open(fileName,'rb') as f:
content = csv.reader(f)
for index_col, data_in_col in enumerate(content):
for index_row, data_in_cell in enumerate(data_in_col):
fileName = (preName + '.xlsx')
delMe = 1
print("Temporary Convert to xlsx done.\n")
stopPoint = fileName.index('.')
prepRev = fileName[0:stopPoint]
# Open the workbook next to this script and take its first sheet.
fname = join(dirname(abspath(__file__)), fileName)
xl_workbook = xlrd.open_workbook(fname)
sheet_names = xl_workbook.sheet_names()
xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])
book = xlwt.Workbook(encoding="utf-8")
worksheet = book.add_sheet("Results", cell_overwrite_ok=True)
workbook = xlrd.open_workbook(fileName)
for sheet in workbook.sheets():
for row in range(sheet.nrows):
row = int(row)
# Integer division under Python 2: index of the 1000-row chunk this row is in.
subDivide = int(row) / 1000
while(thousands != subDivide + 1):
thousands = thousands + 1
counting = 0
totalName = preName + "_" + str(thousands) + ".xlsx"
excelFile = xlsxwriter.Workbook(str(totalName))
worksheet = excelFile.add_worksheet()
# NOTE(review): the output file is opened for reading here although
# excelFile.close() has not been called yet; xlsxwriter only writes the file
# to disk on close(), so this open() presumably fails for a new file.
with open(totalName,'rb') as f:
col = xl_sheet.col_slice(0,1,10101010)
for idx, cell_obj in enumerate(col, start=beginGrab):
counting = counting + 1
if(counting == 1000):
cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
cell_obj_str = str(cell_obj)
# Characters 7..18 of the cell's string form are taken as the phone number.
telePhone = (cell_obj_str[7:19])
worksheet.write(idx+1, 0, "1" + telePhone)
worksheet.write(0,0, "Telephone Number")
beginGrab = thousands * 1000
endGrab = beginGrab + 1000
# NOTE(review): rebinding to None does not call close(); presumably
# excelFile.close() was intended so the workbook is actually saved.
excelFile = None
print("Mate, this is Tiny!")
print ("Ding! Job Done!")
I've been rubber ducking this and I can't find where I'm at fault.
Creating the workbook and then closing it writes the file to disk, so the program can then open it. I will probably open an issue about this.
if prepRev == ".csv":
totalName = preName + '.xlsx'
excelFile = xlsxwriter.Workbook(totalName)
Closing it lets open see it while it still contains the same info.
excelFile = xlsxwriter.Workbook(totalName)
worksheet = excelFile.add_worksheet()
with open(fileName,'rb') as f:
Doesn't the save/close line need to be within the while loop? Otherwise it looks like it will only save either the first/last item:
while(thousands != subDivide + 1):
# write file
That line is probably the reason why you cannot read your file back and your script crashes:
fname = join(dirname(abspath('__file__')), '%s' % fileName)
'__file__' shouldn't have quotes. I'd do:
fname = join(dirname(abspath(__file__)), fileName)
I need to remove the sheets whose names are in an array. Unfortunately neither this: tempWb.remove(wsToRemoveNameArray[wsToRemoveIndex]), nor this:
del tempWb[wsToRemoveNameArray[wsToRemoveIndex]] works with my code:
Does anyone know how to deal with it?
# Saves, for every workbook named in fileNameArray, one output file per sheet
# name into OutputPath.
# NOTE(review): indentation is lost in this copy and the bodies of the inner
# `if`/`for` consist only of commented-out code, so no sheet is ever removed
# before saving; presumably each output was meant to keep a single sheet.
def splitExcelFiles(InputPath, OutputPath, fileNameArray):
for file in range(0, len(fileNameArray)):
tempFile = InputPath + '\\' +fileNameArray[file]
tempWb = load_workbook(tempFile)
wsToRemoveNameArray = []
#new wb
# The file name here is the string form of the whole sheet-name list.
tempWb.save(str(OutputPath) + '\\' + str(tempWb.sheetnames) + '.xlsx')
for ws in range (0,len(tempWb.sheetnames)):
newName = tempWb.sheetnames[ws]
wsToRemoveNameArray = []
#copyWs = tempWb.copy_worksheet[ws]
# #This section will save the names to remove other sheets from ws
for wsToRemoveName in range (0,len(tempWb.sheetnames)):
if newName != tempWb.sheetnames[wsToRemoveName]:
for wsToRemoveIndex in range (0, len(wsToRemoveNameArray)):
# tem
#del tempWb[wsToRemoveNameArray[wsToRemoveIndex]]
# tempWb.
# print(len(wsToRemoveNameArray))
# The workbook is saved under the current sheet's name.
tempWb.save(str(OutputPath) + '\\' + newName + '.xlsx')
First things first, some general tips:
Use the pathlib library whenever you deal with Paths. It makes things a lot easier. There is never a good reason to include the path delimiter in your code.
In Python, variable and function names are conventionally written in snake_case: save_path instead of savePath
Now that we have that out of the way, here is a tested example. Just change the directories to match yours.
from openpyxl import load_workbook, Workbook
from pathlib import Path
import shutil
def make_absolute_path(path, name, suffix=".xlsx"):
    """Join *path* and *name* and force the result to end in *suffix*.

    Args:
        path: Directory part; a ``pathlib.Path`` or anything ``Path()``
            accepts (e.g. a plain string).
        name: File name, with or without an extension.
        suffix: Extension to apply, including the leading dot.

    Returns:
        ``Path(path) / name`` with its last extension replaced by *suffix*
        (or with *suffix* appended when *name* has no extension).  Despite
        the function's name the result is not resolved; it is absolute only
        when *path* is.
    """
    return (Path(path) / name).with_suffix(suffix)
# Copies each named workbook from input_path to output_path and works on the
# copy, comparing every sheet name with the input file's stem.
# NOTE(review): indentation is lost and several lines appear to be missing in
# this copy (the skip after the "Skipping" message, the sheet removal, the
# save/append of save_path, and closing parentheses), so the exact control
# flow cannot be confirmed from here.
def remove_all_sheets_except_filename(input_path, output_path, filenames):
output_files_path = []
for i in range(0, len(filenames)):
input_file_path = make_absolute_path(input_path, filenames[i])
output_file_path = make_absolute_path(output_path, filenames[i])
if not Path.is_file(input_file_path):
print(f"Skipping {input_file_path}: Not valid file " f"path. ")
# Work on a copy so the input workbook is left untouched.
shutil.copyfile(input_file_path, output_file_path)
wb_source: Workbook = load_workbook(filename=output_file_path)
sheets = wb_source.worksheets
# A workbook with a single sheet gets a save path named after that sheet.
if len(sheets) == 1:
save_path = make_absolute_path(output_path, str(wb_source.sheetnames[0]))
for sheet in wb_source.sheetnames:
if not sheet == input_file_path.stem:
if len(wb_source.worksheets) == 1:
save_path = make_absolute_path(
output_path, str(wb_source.sheetnames[0])
# NOTE(review): wb_source.worksheets holds worksheet objects, not strings, so
# ','.join(...) would raise TypeError; presumably wb_source.sheetnames was meant.
f"Failed to process {input_file_path} with following "
f"sheets: {','.join(wb_source.worksheets)}."
raise ValueError("")
return output_files_path
# Example entry point: calls remove_all_sheets_except_filename with
# placeholder directories and file names.
# NOTE(review): the closing parenthesis of the call and the body of the
# __main__ guard (presumably `main()`) are missing from this copy.
def main():
# Adjust to where you have the xlsx files and where you want them
input_directory = Path("path/to/your/xlsx/files")
output_directory = Path("path/to/the/output/directory")
# Names are given without extension; make_absolute_path adds ".xlsx".
file_names = ["input", "foo", "bar"]
paths = remove_all_sheets_except_filename(
input_directory, output_directory, file_names
if __name__ == "__main__":
Hey, I want to quickly slice my source XLSX file and collect the data from the cell at column index 11, but my script runs very slowly.
The expected output is the items collected from the column-index-11 cell of every row whose column-index-16 cell is None. The script checks column index 16 of every row from the beginning for None, but my file has thousands of rows before the point where the script starts collecting data.
Can I speed up this process, or is there a faster way?
import openpyxl
def getRowCount(file, sheetName):
    """Return the highest used row number of sheet *sheetName* in *file*.

    The workbook is loaded from disk on every call.

    Args:
        file: Path of the .xlsx workbook.
        sheetName: Title of the worksheet to inspect.

    Returns:
        The worksheet's ``max_row``.

    Raises:
        KeyError: If the workbook has no sheet called *sheetName*.
    """
    workbook = openpyxl.load_workbook(file)
    # workbook[name] replaces the deprecated get_sheet_by_name().
    sheet = workbook[sheetName]
    return sheet.max_row
def getColumnCount(file, sheetName):
    """Return the highest used column number of sheet *sheetName* in *file*.

    The workbook is loaded from disk on every call.

    Args:
        file: Path of the .xlsx workbook.
        sheetName: Title of the worksheet to inspect.

    Returns:
        The worksheet's ``max_column``.

    Raises:
        KeyError: If the workbook has no sheet called *sheetName*.
    """
    workbook = openpyxl.load_workbook(file)
    # workbook[name] replaces the deprecated get_sheet_by_name().
    sheet = workbook[sheetName]
    return sheet.max_column
def readData(file, sheetName, rownum, columnno):
    """Return the value of one cell of sheet *sheetName* in workbook *file*.

    The whole workbook is loaded from disk on every call, so calling this
    once per cell in a loop is slow; load the workbook once and index the
    sheet directly when many cells are needed.

    Args:
        file: Path of the .xlsx workbook.
        sheetName: Title of the worksheet to read.
        rownum: 1-based row number.
        columnno: 1-based column number.

    Returns:
        The cell's value, or ``None`` for an empty cell.

    Raises:
        KeyError: If the workbook has no sheet called *sheetName*.
    """
    workbook = openpyxl.load_workbook(file)
    # workbook[name] replaces the deprecated get_sheet_by_name().
    sheet = workbook[sheetName]
    return sheet.cell(row=rownum, column=columnno).value
def writeData(file, sheetName, rownum, columno, data):
    """Write *data* into one cell of sheet *sheetName* and save workbook *file*.

    The workbook is loaded from and saved back to *file* on every call.

    Args:
        file: Path of the .xlsx workbook; it is overwritten in place.
        sheetName: Title of the worksheet to modify.
        rownum: 1-based row number.
        columno: 1-based column number.
        data: Value to store in the cell.

    Raises:
        KeyError: If the workbook has no sheet called *sheetName*.
    """
    workbook = openpyxl.load_workbook(file)
    # workbook[name] replaces the deprecated get_sheet_by_name().
    sheet = workbook[sheetName]
    sheet.cell(row=rownum, column=columno).value = data
    # Without saving, the change only exists in the in-memory workbook that
    # is discarded when this function returns.
    workbook.save(file)
My Script:
import pandas as pd
import XLUtils
from openpyxl import Workbook
from datetime import datetime
#This function Create product list file for current day
# Product-list generator: reads columns 11 and 16 of 'Sheet1' of
# PRODUCT_RESEARCH row by row and later exports FILE to CSV.
# NOTE(review): indentation, the enclosing function header and several
# statements appear to be missing from this copy (e.g. nothing visible updates
# i or x, appends to LIST, or defines PRODUCT_NAME), so the exact flow cannot
# be confirmed.
# NOTE(review): every XLUtils.readData call reloads the whole workbook from
# disk (see XLUtils.readData), so the loop below opens the file up to three
# times per row; this is the likely cause of the slowness.
i = 1
i_range = 50
r = 1
x = 1
rows = 50
LISTED_DATE = XLUtils.readData(PRODUCT_RESEARCH,'Sheet1',r,16)
if LISTED_DATE == None:
ASIN = XLUtils.readData(PRODUCT_RESEARCH,'Sheet1',r,11)
wb.save('Product_'+TODAY_DATE + '.xlsx')
print('File has been created: ',FILE)
# Rows 2..rows are scanned; column 11 holds the ASIN, column 16 is read as FE.
for r in range(2,rows+1):
CHECK_ASIN = XLUtils.readData(PRODUCT_RESEARCH,'Sheet1',r, 11)
if CHECK_ASIN == None:
print('No more ASIN avaiable')
FE = XLUtils.readData(PRODUCT_RESEARCH,'Sheet1',r, 16)
if FE != None:
print('Product last added: ',FE)
if FE == None:
ASIN = XLUtils.readData(PRODUCT_RESEARCH,'Sheet1',r,11)
print(f'ASIN nr. {i}: {ASIN}')
if i >= i_range:
print(f'List of {i_range} items, has been Created.')
print('Error: product on the list')
XLUtils.writeData(FILE,'Sheet',x,1,' ')
print('Created list:\n',LIST)
ALL_ITEMS = (TODAY_DATE + '.xlsx')
print('CSV file has been named: ', ALL_ITEMS)
# Re-read the generated workbook with pandas and export it as CSV.
DATA_XLS = pd.read_excel(FILE, 'Sheet', dtype=str, index_col=None)
DATA_XLS.to_csv(PRODUCT_NAME+'.csv', encoding='utf-8', index=False)
if __name__ == '__main__':
#---Product_list_generator ---# Variable --
wb = Workbook()
LIST = []
# Today's date, once with '_' (used in file names) and once with '/' separators.
TODAY_DATE = datetime.today().strftime('%d_%m_%Y')
TODAY_DATE_XLSX = datetime.today().strftime('%d/%m/%Y')
PRODUCT_RESEARCH = ('Product_Research_copy2.xlsx') #<--- xlsx File
FILE = ('Product_'+TODAY_DATE + '.xlsx')
I have over 100K CSV files (total size north of 150 GB) which I need to join. All have standard column names, although the order of the columns may differ and some CSVs have a few columns missing.
So far I have just created a dataframe and kept concatenating the dataframe from each CSV onto it in every iteration, to build one standard dataframe containing all columns, which I eventually intend to save as a CSV.
I tried building the dataframe from 1000 sample CSVs and noticed that as the dataframe grew, the iteration rate dropped from 10 to 1.5 per second, which probably means it would follow a similar trend if I went all-in with 100k CSVs, taking days if not months to combine them.
Is there a better way of combining a huge number of CSV files?
Here is my code
# Reads every CSV listed in excelNames and grows df_t1 with those whose
# lower-cased name ends in 'type1.csv', then writes the result to df_t1.csv.
# NOTE(review): indentation is lost in this copy.
df_t1 = pd.DataFrame()
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
# Every file is parsed here, before the name filter below is applied.
df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False,low_memory=False)
# Adds the source file name to every row.
df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
if thisCSV.endswith('type1.csv'):
# pd.concat builds a new frame from df_t1 and df on every iteration, so the
# work per iteration grows with the accumulated size.
df_t1 = pd.concat([df_t1,df], axis=0, ignore_index=True)
df_t1.to_csv(outpath + "df_t1.csv", index = None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Possible improvement
Method 1: Using Pandas
# Method 1: collect the per-file frames in a list and concatenate once at the
# end.  Calling pd.concat inside the loop rebuilds the accumulated frame on
# every iteration, which is what makes the original version slow down.
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    # Filter on the name first so non-matching files are never parsed.
    if thisCSV.endswith('type1.csv'):
        df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
        # Assigning a scalar places thisCSV in every row; no per-row list needed.
        df["File Name"] = thisCSV
        df_t1_lst.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no file matched.
if df_t1_lst:
    df_t1 = pd.concat(df_t1_lst, ignore_index=True)
else:
    df_t1 = pd.DataFrame()
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Method 1a
Using Pandas to continuously append to CSV output file
import os
import pandas as pd
def str_to_bytes(s):
    """Return *s* encoded as a UTF-8 ``bytearray``.

    ASCII text yields one byte per character.  Encoding as UTF-8 (instead of
    mapping ``ord`` over the characters) also accepts characters above
    U+00FF, which would otherwise raise ``ValueError``, and keeps non-ASCII
    text valid inside a UTF-8 file.

    Args:
        s: Text to convert.

    Returns:
        A new ``bytearray`` holding the encoded text.
    """
    return bytearray(s, "utf-8")
def good_file(file_path):
    """Return True when *file_path* is an existing, non-empty regular file.

    Directories and missing paths yield False.

    Args:
        file_path: Path to check.

    Returns:
        ``True`` if the path is a regular file whose size is greater than
        zero, ``False`` otherwise.
    """
    try:
        return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    except OSError:
        # The file vanished or became inaccessible between the two checks.
        return False
# Method 1a: appends each matching CSV to one output file as soon as it has
# been read, so only one input frame is held in memory at a time.
# NOTE(review): indentation is lost in this copy.
# NOTE(review): tqdm is used below but is not imported by this snippet's own
# import lines; SEPARATOR is not used by the visible code.
SEPARATOR = ',' # Separator used by CSV file
write_header = True
# Placeholder values; they are overwritten by the concrete example values below.
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]
# NOTE(review): in a raw string "\\" stays two backslashes, so these paths
# contain doubled separators.
pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]
output_file = outpath + "df_t1.csv"
with open(output_file, "w") as ofile:
pass # create empty output file
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
input_file = pathxl + "\\" + thisCSV
if thisCSV.endswith('type1.csv') and good_file(input_file):
df = pd.read_csv(input_file)
if df.shape[0] > 0:
df['File Name'] = thisCSV # Add filename
# NOTE(review): sorting gives a stable column order, but the header is written
# only once, so a file with missing or extra columns would presumably be
# misaligned with it; confirm all inputs share the same columns.
df = df.sort_index(axis = 1) # sort the columns by name in ascending order
# Append to output file
df.to_csv(output_file, mode='a',
index = False,
header= write_header)
write_header = False # Only write header once
del df
Method 2: Binary Files
Reading/Writing binary and using memory-map should be faster.
from tqdm import tqdm
import os
import mmap
def str_to_bytes(s):
    """Return *s* encoded as a UTF-8 ``bytearray``.

    ASCII text yields one byte per character.  Encoding as UTF-8 (instead of
    mapping ``ord`` over the characters) also accepts characters above
    U+00FF, which would otherwise raise ``ValueError``, and keeps non-ASCII
    file names valid when they are appended to a UTF-8 CSV.

    Args:
        s: Text to convert.

    Returns:
        A new ``bytearray`` holding the encoded text.
    """
    return bytearray(s, "utf-8")
def good_file(file_path):
    """Return True when *file_path* is an existing, non-empty regular file.

    Directories and missing paths yield False.  The script uses this as a
    guard before memory-mapping an input file.

    Args:
        file_path: Path to check.

    Returns:
        ``True`` if the path is a regular file whose size is greater than
        zero, ``False`` otherwise.
    """
    try:
        return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    except OSError:
        # The file vanished or became inaccessible between the two checks.
        return False
# Method 2: merge the CSVs as raw bytes.  The header row of the first input
# file is written once (extended by a "File Name" column); the header row of
# every later file is dropped, and each data row gets the source file name
# appended.
SEPARATOR = ','  # Separator used by CSV file
header = None    # Header of the first file; None until one has been written

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

with open(outpath + "df_t1.csv", "wb") as ofile:
    for i in tqdm(range(len(excelNames))):
        thisCSV = str(excelNames[i]).lower().strip()
        input_file = pathxl + "\\" + thisCSV
        # good_file() also keeps empty files away from mmap, which cannot map them.
        if not (thisCSV.endswith('type1.csv') and good_file(input_file)):
            continue
        with open(input_file, "rb") as ifile:
            print('file ', thisCSV)
            # memory-map the file, size 0 means whole file
            with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                # readline() walks the mapping row by row instead of copying
                # the whole file into memory with read().split().
                first_line = mmap_obj.readline()
                if header is None:
                    header = first_line.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
                    ofile.write(header)  # write header
                # else: first_line is a repeated header row and is ignored

                # write data to output file
                file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
                for line in iter(mmap_obj.readline, b""):
                    if line.strip():  # skip blank lines
                        ofile.write(line.rstrip() + file_value)
I have a series of CSV files like this one. I’m trying to convert and merge them into an xlsx file with Python and openpyxl, using this code:
import csv
import openpyxl
import glob
# Creates one worksheet per CSV file found in csvpath and copies the CSV
# values into it as floats, then labels the first row.
# NOTE(review): indentation is lost and lines appear to be missing in this
# copy: k and g are never initialised or incremented in the visible code, and
# nothing visible saves the workbook to destinationfilepath.
csvpath = 'C:/Users/Lorenzo/Downloads/CSV/'
csvfiles = glob.glob(csvpath + '*.csv')
data = input('Inserisci data Simulazione: ')
destinationfilepath = 'C:/Users/Lorenzo/Desktop/Simulazione_' + data + '.xlsx'
wb = openpyxl.Workbook()
for i in range(len(csvfiles)):
filename = csvfiles[i]
reader = csv.reader(open(filename), delimiter=',')
# Sheet title: the file name without the directory prefix and the '.csv' suffix.
csvname = filename[len(csvpath):-4]
ws1 = wb.create_sheet(csvname)
for row in reader:
if k==0:
for cell in row:
c= ws1.cell(row=k, column=g)
# Store a number rather than the CSV text.
c.value = float(cell)
# Column headings written into the first row.
ws1['A1'] = 'Iteration'
ws1['B1'] = 'CD'
ws1['C1'] = 'CL'
ws1['D1'] = 'CL_F'
ws1['E1'] = 'CL_R'
ws1['F1'] = 'CM'
# 'Sheet' is the default worksheet created by openpyxl.Workbook().
sheet = wb['Sheet']
The code runs, but in most cells (and, strangely enough, not in all of them) I get the error “number stored as text”, despite converting the values with float() as suggested in this and similar topics.
What is it that I'm doing wrong?
I'm trying to replicate the exporting of a Code Module from an Excel sheet in Python.
The following works in VBA:
' Exports every standard code module of this workbook to "<module name>.txt"
' in the folder the workbook is saved in.
Public Sub ExportModules()
    Dim wb As Workbook
    Dim VBComp As Object    ' late-bound VBComponent
    Dim D As String         ' target folder
    Dim N As String         ' full path of the exported file

    Set wb = ThisWorkbook
    D = wb.Path

    For Each VBComp In wb.VBProject.VBComponents
        ' Type 1 is a standard module (vbext_ct_StdModule).
        If VBComp.Type = 1 Then
            N = D & "\" & VBComp.Name & ".txt"
            VBComp.Export N
        End If
    Next VBComp
End Sub
And I have the following in Python:
import os
import sys
import glob
from win32com.client import Dispatch
# Exports every standard VBA module of each .xlsm workbook in scripts_dir to
# "<module name>.txt" next to the workbook, mirroring the VBA ExportModules
# macro.
scripts_dir = 'folder address'
com_instance = Dispatch("Excel.Application")
com_instance.Visible = False
com_instance.DisplayAlerts = False
try:
    for script_file in glob.glob(os.path.join(scripts_dir, "*.xlsm")):
        # Excel resolves relative names against its own working directory,
        # so hand it absolute paths.
        script_file = os.path.abspath(script_file)
        print("Processing: %s" % script_file)
        (file_path, file_name) = os.path.split(script_file)
        objworkbook = com_instance.Workbooks.Open(script_file)
        try:
            for xlmodule in objworkbook.VBProject.VBComponents:
                # Type 1 is a standard module (vbext_ct_StdModule).
                if xlmodule.Type == 1:
                    # One file per module, named after the module, so that
                    # exports do not overwrite each other.
                    xlmodule.Export(os.path.join(file_path, xlmodule.Name + ".txt"))
        finally:
            # Close without saving; nothing in the workbook was changed.
            objworkbook.Close(False)
finally:
    # Do not leave a hidden Excel process running.
    com_instance.Quit()
My question is, what do I have to do in Python to replicate the Export of the file as per the VBA code?
Use oletools; xltrail provides a good way to extract .bas files from .xlsm or other Excel files:
import os
import shutil
from oletools.olevba3 import VBA_Parser
EXCEL_FILE_EXTENSIONS = ('xlsb', 'xls', 'xlsm', 'xla', 'xlt', 'xlam',)
def parse(workbook_path):
    """Extract the VBA modules of *workbook_path* into ``<workbook_path>.vba/``.

    Every module that contains at least one non-empty line of code is written
    to ``<module name>.bas`` inside that directory, with its
    ``Attribute VB_...`` lines removed.  The directory is only created when
    there is something to write.

    Args:
        workbook_path: Path of the Excel file to inspect.
    """
    vba_path = workbook_path + '.vba'
    vba_parser = VBA_Parser(workbook_path)
    try:
        vba_modules = vba_parser.extract_all_macros() if vba_parser.detect_vba_macros() else []

        for _, _, _, content in vba_modules:
            # The macro source may arrive as bytes or as already-decoded text.
            if isinstance(content, bytes):
                decoded_content = content.decode('latin-1')
            else:
                decoded_content = content

            if '\r\n' in decoded_content:
                lines = decoded_content.split('\r\n')
            else:
                lines = decoded_content.split('\n')
            if not lines:
                continue

            # The first line carries the module name: Attribute VB_Name = "..."
            name = lines[0].replace('Attribute VB_Name = ', '').strip('"')
            code = [line for line in lines[1:] if not (
                line.startswith('Attribute') and 'VB_' in line)]
            # Drop the empty string left behind by a trailing newline.
            if code and code[-1] == '':
                code.pop()

            # Skip modules that contain no actual code.
            if any(code):
                os.makedirs(vba_path, exist_ok=True)
                with open(os.path.join(vba_path, name + '.bas'), 'w') as f:
                    f.write('\n'.join(code))
    finally:
        vba_parser.close()
# Entry point: refresh the exported VBA sources below the current directory.
if __name__ == '__main__':
    for root, dirs, files in os.walk('.'):
        # Remove stale export directories.  Iterate over a copy and prune the
        # removed names from dirs so os.walk does not try to descend into them.
        for d in list(dirs):
            if d.endswith('.vba'):
                shutil.rmtree(os.path.join(root, d))
                dirs.remove(d)
        # Only Excel workbooks are handed to the parser.
        for f in files:
            if f.endswith(EXCEL_FILE_EXTENSIONS):
                parse(os.path.join(root, f))
I have tried it and it works great.
Ref: https://www.xltrail.com/blog/auto-export-vba-commit-hook