How to copy multiple .xlsx files into a respective .csv file? - python

I have 24 Excel files, and I'm aiming to copy the .xlsx data into their respective 24 .csv files. I have copied the data over; however, it's creating 10 copies in the .csv files. I believe it has something to do with the for loops. I've tried to use writerow() rather than writerows(), yet that doesn't help. I'm trying to understand openpyxl and its writer and reader objects.
import openpyxl, os, csv
from pathlib import Path
# Write every sheet of every .xlsx file in ./excelspreadsheets to its own CSV file.
# NOTE(review): the indentation of this snippet was lost, so the exact loop nesting cannot be confirmed from here.
for excelFile in os.listdir('./excelspreadsheets'):
if excelFile.endswith('.xlsx'): # Skip non xlsx files, load the workbook object
wb = openpyxl.load_workbook('./excelspreadsheets/' + excelFile)
for sheetName in wb.sheetnames:
# Loop through every sheet in the workbook
sheet = wb[sheetName]
sheetTitle = sheet.title
# Create the CSV filename from the Excel filename and sheet title
p = Path(excelFile)
excelFileStemName = p.stem
CsvFilename = excelFileStemName + '_' + sheetTitle + '.csv'
# Create the csv.writer object for this CSV file
print(f'Creating filename {CsvFilename}...')
outputFile = open(CsvFilename, 'w', newline='')
outputWriter = csv.writer(outputFile)
# Create reader object for each excel sheet
# NOTE(review): fileObj and fileReaderObj are never used below, and fileObj is never closed.
fileObj = open('./excelspreadsheets/' + excelFile)
fileReaderObj = csv.reader(fileObj)
# Loop through every row in the excel sheet
for rowNum in range(1, sheet.max_row + 1):
rowData = [] # append each cell to this list
# Loop through each cell in the row
for colNum in range(1, sheet.max_column + 1):
# NOTE(review): this appends sheet.values (all rows of the sheet), not the single
# cell at (rowNum, colNum), once per column iteration.
rowData.append(sheet.values)
# write the rowData list to the CSV file.
# NOTE(review): writerows() writes a whole sheet for every entry of rowData, which is
# presumably why the data shows up once per column (10 copies) - confirm.
for row in rowData:
outputWriter.writerows(row)
outputFile.close()
So, each of the newly created .csv files writes the correct data but does it 10 times, rather than once.
Appreciate any feedback thanks.

You can use read_excel and to_csv, which come as part of pandas to read excel file and write the data to csv file. It is just simpler from coding perspective, as the read and write will be done in one line. It also uses Openpyxl underneath. The updated code is below.
import openpyxl, os, csv
from pathlib import Path
import pandas as pd
# Convert every sheet of every .xlsx workbook in ./excelspreadsheets into
# its own CSV file named "<workbook stem>_<sheet name>.csv".
for excelFile in os.listdir('./excelspreadsheets'):
    if not excelFile.endswith('.xlsx'):  # Skip non xlsx files
        continue
    # Open the workbook once and parse every sheet from that handle instead
    # of re-reading the file from disk for each sheet.
    with pd.ExcelFile('./excelspreadsheets/' + excelFile) as xls:
        for sheetname in xls.sheet_names:
            # Read each sheet into df
            df = xls.parse(sheetname)
            # Path.stem removes only the final ".xlsx" suffix.  str.rstrip
            # strips a *set of characters*, so rstrip('.xlsx') would turn
            # "sales.xlsx" into "sale".
            CsvFilename = Path(excelFile).stem + '_' + sheetname + '.csv'
            print(f'Creating filename {CsvFilename}...')
            # Write df as CSV to file
            df.to_csv(CsvFilename, index=False)
Let me know if you see any errors...

Related

Python Pandas csv files to Excel worksheets - Cleanup

I want to take multiple .csv files and convert them to Excel worksheets in one workbook, specifically using Pandas.
I finally got this to work, but I know the code itself is poorly written.
Any suggestions on how to clean this up?
"Beautify is better than Ugly"
Here is the code:
import pandas as pd
import os
import openpyxl as xl
directory = os.path.join(os.curdir, "data/")
new_xl_file_path = "csv_merge.xlsx"

# List the CSV files once so that every sheet name and its data come from
# the same directory entry (sorted for a deterministic sheet order).
name_list = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))

# pd.ExcelWriter creates the workbook itself, so no empty workbook has to
# be saved with openpyxl beforehand.
with pd.ExcelWriter(new_xl_file_path) as writer:
    for filename in name_list:
        dataframe = pd.read_csv(os.path.join(directory, filename))
        # One worksheet per CSV file, named after the file.
        dataframe.to_excel(writer, index=False, sheet_name=filename)

Copy Data from different workbook into Master Workbook with Python

I have to copy data from different workbooks and paste it into a master workbook. All the workbooks are located in a folder: C:\Users\f65651\data transfer. The copied data should be merged into one and then overwritten into the Master wkbk cells. Subsequently also, data from updated workbooks should be overwritten in the Master wkbk.
After some help, I have been able to incorporate all the excel workbooks together
import openpyxl as xl
import os
# Master workbook.  A raw string keeps the Windows backslashes literal
# without relying on invalid escape sequences such as '\R'.
path1 = r'C:\Users\f65651\Rresult.xlsx'
# Folder holding the workbooks to merge; the loop below needs this name.
directory = r'C:\Users\f65651\data transfer'

wb1 = xl.load_workbook(filename=path1)
ws1 = wb1.worksheets[0]

# iterating over the workbooks
for filename in os.listdir(directory):
    if filename.endswith(".xlsx"):
        g = os.path.join(directory, filename)
        f = xl.load_workbook(filename=g)
        f1 = f.worksheets[0]
        print(filename, f1)
        # Append every row of the source sheet below the master's last row.
        for row in f1:
            values = [cell.value for cell in row]
            ws1.append(values)

# Save once, after all workbooks have been appended.
wb1.save(path1)
print('Process finished!')
However, with the code above, the data is appended under the Master workbook's existing table instead of being written directly into its cells.
I have tried fixing this issue but I don't know how. I feel I am not copying the workbooks into the Master workbook correctly. I also don't want to lose the formatting in the Master sheet. Please help!
For better understanding of the problem, I have attached a snippet of what i am trying to achieve, Data 1&2 are examples of the workbks and the Result file is the master sheet.
https://i.stack.imgur.com/0G4lM.png
import os

# Workbook is needed to create the master workbook below.
from openpyxl import Workbook, load_workbook
from openpyxl.utils import get_column_letter

directory = "workbooks"
master = Workbook()
master_sheet = master.active
master_sheet.title = "master_sheet"

for filename in os.listdir(directory):
    if filename.endswith(".xlsx"):
        file_path = os.path.join(directory, filename)
        sheet = load_workbook(file_path).active
        for index, row in enumerate(sheet.iter_rows()):
            if index <= 1:
                # The first two rows are copied cell by cell to the same
                # coordinates in the master sheet.
                for cell in row:
                    master_sheet[cell.coordinate].value = cell.value
            else:
                # Rows from row 3 onwards are appended, keyed by column
                # letter.  get_column_letter also handles columns past "Z"
                # (e.g. "AA"), which slicing the first character of the
                # coordinate does not.
                row_dict = {get_column_letter(cell.column): cell.value for cell in row}
                master_sheet.append(row_dict)

master.save("sheet3.xlsx")

Convert multiple csv files to Excel files?

In Python 2.7, I have a lot of csv files I want to convert to Excel.
The names of the csv files are abcd1.csv, abcd2.csv and so on.
I want to convert them to abcd1.xls, abcd2.xls and so on.
While I am able to do it on one file, I don't know how to do it on multiple files.
This is the function I have used so far:
from openpyxl import Workbook
import csv
# Copy the rows of one CSV file into the active sheet of a new workbook.
wb = Workbook()
ws = wb.active
# NOTE(review): count3 is not defined in this snippet - presumably a counter set by the surrounding loop.
file_name = "COUNT16_DISTRIBUTION" + str(count3*1) + ".csv"
with open(file_name, 'r') as f:
for row in csv.reader(f):
ws.append(row)
# NOTE(review): save() is called without a target filename; the output path still has to be supplied.
wb.save()
The file_name can be used in a while loop so I can go through each csv file, but I don't know how to save them as .xls.
Here is an example with pandas:
import pandas as pd
import os
def csv2excel(filepath, sep=','):
    """Write the CSV at *filepath* to an .xlsx file with the same base name.

    ``sep`` is the field delimiter handed to ``pandas.read_csv``.
    """
    base, _ext = os.path.splitext(filepath)
    pd.read_csv(filepath, sep=sep).to_excel(base + '.xlsx', index=False)


# Convert every abcd*.csv file found in the current directory.
for name in os.listdir('.'):
    if name.startswith('abcd') and name.endswith('.csv'):
        csv2excel(name)

Copy specific lines from multiple text files to an excel file

I have as many as 1500 text files and I want to copy 5 lines from every text file, say line 4,5,9,14 and 32. I want to make columns of these files in an excel sheet one below the other, of the 1500 text files. I have figured out a code that takes in only one txt file but copies all the data into rows. Any help will be appreciated.
Here is my code:
import csv
import xlwt
import os
import sys
# Look for input file in same location as script file:
inputfilename = os.path.join(os.path.dirname(sys.argv[0]),
                             'C:/path/filename.txt')
# Strip off the path
basefilename = os.path.basename(inputfilename)
# Strip off the extension
basefilename_noext = os.path.splitext(basefilename)[0]
# Get the path of the input file as the target output path
targetoutputpath = os.path.dirname(inputfilename)
# Generate the output filename
outputfilename = os.path.join(targetoutputpath, basefilename_noext + '.xls')
# Create a workbook object
workbook = xlwt.Workbook()
# Add a sheet object
worksheet = workbook.add_sheet(basefilename_noext, cell_overwrite_ok=True)
# Process the file and output to Excel sheet.  The input is opened in a
# ``with`` block so the handle is closed once every row has been written.
# NOTE(review): mode 'rb' is kept from the original; presumably this snippet
# targets Python 2, where the csv module expects binary mode - confirm.
with open(inputfilename, 'rb') as inputfile:
    # Get a CSV reader object set up for reading the input file with tab
    # delimiters
    datareader = csv.reader(inputfile, delimiter='\t', quotechar='"')
    for rowno, row in enumerate(datareader):
        for colno, colitem in enumerate(row):
            worksheet.write(rowno, colno, colitem)
# Write the output file.
workbook.save(outputfilename)
# Open it via the operating system (will only work on Windows)
# On Linux/Unix you would use subprocess.Popen(['xdg-open', filename])
os.startfile(outputfilename)
You would first need to put all of your required text files in the current folder, glob.glob('*.txt') could then be used to get a list of these filenames. For each text file, read the files in using readlines() and extract the required lines using itemgetter(). For each file, create a new row in your output worksheet and write each line as a different column entry.
import xlwt
import glob
import operator
# Create a workbook object
wb = xlwt.Workbook()
# Add a sheet object
ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)

# 1-based numbers of the lines wanted from every text file.  They are
# converted to 0-based indexes because readlines() returns a list whose
# first line sits at index 0.
line_numbers = (4, 5, 9, 14, 32)
pick_lines = operator.itemgetter(*[number - 1 for number in line_numbers])

rowy = 0
for text_filename in glob.glob('*.txt'):
    with open(text_filename) as f_input:
        try:
            lines = [line.strip() for line in pick_lines(f_input.readlines())]
        except IndexError:
            # The file has fewer lines than the highest requested number.
            # The call form of print works on both Python 2 and Python 3.
            print("'{}' is too short".format(text_filename))
            lines = []
    # Output to Excel sheet: one row per file, one column per picked line
    for colno, colitem in enumerate(lines):
        ws.write(rowy, colno, colitem)
    rowy += 1

# Write the output file.
wb.save('output.xls')

Converting multiple xls files to xlsx- issues with scaling up from single file

We have a few thousand xls files, with dozens of sheets in each file. We are working on a larger project to combine the files and sheets, but first need to convert them to xlsx.
The following code works fine on a single file:
import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook
# Convert a single .xls workbook to .xlsx, sheet by sheet.
# The paths are placeholders; they must be quoted string literals.
xlsBook = xlrd.open_workbook("C://path")
workbook = openpyxlWorkbook()
for i in xrange(0, xlsBook.nsheets):
    xlsSheet = xlsBook.sheet_by_index(i)
    # Reuse the sheet a new workbook starts with for the first source
    # sheet, then create one more sheet per additional source sheet.
    sheet = workbook.active if i == 0 else workbook.create_sheet()
    sheet.title = xlsSheet.name
    for row in xrange(0, xlsSheet.nrows):
        for col in xrange(0, xlsSheet.ncols):
            # Source indexes start at 0, target rows/columns start at 1.
            sheet.cell(row=row+1, column=col+1).value = xlsSheet.cell_value(row, col)
workbook.save("c://path/workbook.xlsx")
This works perfectly.
When attempting to loop through all files, we use:
import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook
import glob
import pandas as pd
from pandas import ExcelWriter
import os
# Convert every .xls file found under `path` to .xlsx, sheet by sheet.
# NOTE(review): the indentation of this snippet was lost, so which statements sit inside
# which loop (in particular the final save) cannot be confirmed from here.
path ="C://path"
path2 = "C://path2"
allFiles = glob.glob(path + "/*.xls")
for file_ in allFiles:
xlsBook = xlrd.open_workbook(file_)
workbook = openpyxlWorkbook()
for i in xrange(0, xlsBook.nsheets):
xlsSheet = xlsBook.sheet_by_index(i)
sheet = workbook.active if i == 0 else workbook.create_sheet()
sheet.title = xlsSheet.name
for row in xrange(0, xlsSheet.nrows):
for col in xrange(0, xlsSheet.ncols):
sheet.cell(row=row+1, column=col+1).value = xlsSheet.cell_value(row, col)
# NOTE(review): file_ comes from glob.glob(path + ...) and so already starts with `path`.
# os.path.join discards earlier components when a later one is absolute, so this target
# would presumably not land in path2, and it keeps the .xls extension - confirm.
##workbook.save(os.path.join(path2,file_))
##workbook.to_excel(os.path.join(path2,file_))
# NOTE(review): fixed output name, so every converted workbook is written to the same file.
workbook.save("C://path/workbook.xlsx")
For the first two commented out save methods, workbook.save seems to do absolutely nothing, and to_excel tells me workbook does not have a property called to_excel...is that because I didn't call pandas in the loop?
The final workbook.save was a test- I assumed it would save the final iteration of the loop correctly, since it worked in the script with just one file.
Instead, it creates the file, with all of the worksheets correctly named, but no data in any of the worksheets.
Any idea what I am missing? To be clear, I am looking to have each file named with its original filename at the end of the loop, and a valid xlsx extension.
I'd try this way instead. Simpler code and it worked when I tested it.
import glob
import os

import pandas as pd


def converter(filename, output_dir=r'C:\Users\you\Desktop'):
    """Convert one .xls workbook into an .xlsx workbook in *output_dir*.

    Every sheet of *filename* is copied under its original name.  The new
    file keeps the source's base name with the extension replaced by
    ``.xlsx``.
    """
    with pd.ExcelFile(filename) as xl:  # reads file in
        # dictionary with sheet names as keys and data as values
        sheets_dict = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}

    # basename/splitext isolate the file name and drop its extension,
    # whatever the extension's length.
    stem = os.path.splitext(os.path.basename(filename))[0]
    target = os.path.join(output_dir, stem + '.xlsx')

    # The context manager saves and closes the writer; ExcelWriter.save()
    # is not available in pandas 2.x.
    with pd.ExcelWriter(target) as writer:
        # .items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
        for sheet_name, data in sheets_dict.items():
            data.to_excel(writer, sheet_name=sheet_name, index=False)


files = glob.glob(os.path.join(r'C:\Users\you\Desktop', '*.xls'))
for xls_path in files:
    converter(xls_path)
Edit: I'm not too familiar with openpyxl but I don't believe it has a .to_excel method. I think you were creating a openpyxl workbook but then trying to save it using a pandas method.

Categories

Resources