Writing CSV content to Excel produces empty sheets - Python

Writing a DataFrame to an Excel file leaves the sheets with zero data.
I am creating a Robotics "Scouting application". It receives multiple .csv files over the course of two days. Each csv file is named with a four-digit team number, a hyphen, and a match number, for example "2073-18.csv". Multiple files will arrive for each team. I need one sheet per team, with the content of every csv file for that team on the same sheet. Creating the sheets works; writing the data to those sheets doesn't.
import os
import glob
import csv
from xlsxwriter.workbook import Workbook
import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
from openpyxl import load_workbook

os.chdir("/EagleScout")
path = '.'
extension = 'csv'
engine = 'xlsxwriter'
files_in_dir = [f for f in glob.glob('*.csv')]
workbook = Workbook('Tournament.xlsx')
with pd.ExcelWriter('Tournament.xlsx') as writer:
    for csvfile in files_in_dir:
        df = pd.read_csv(csvfile)
        fName, fExt = os.path.splitext(csvfile)
        sName = fName.split('-')
        worksheet = workbook.get_worksheet_by_name(sName[0])
        if worksheet is None:
            worksheet = workbook.add_worksheet(sName[0])  # worksheet named after the csv file
        df.to_excel(writer, sheet_name=sName[0])
    writer.save()
workbook.close()
What I need is one workbook with one sheet for each team, up to 70 teams. Each sheet will have multiple rows, one for each csv file that arrived for that team. The question is, how do I get Pandas, or other libraries, to write the content of each csv file to its appropriate sheet in the workbook?

OK, with the input from @ivan_pozdeev, I finally got past my issues.
Remember, my original desire was to generate a script that could be run on a regular basis and produce a spreadsheet with multiple worksheets. Each worksheet would contain all the data from the .csv files for every match played so far, grouped by team number.
I have also added a single spreadsheet that contains the raw data.
Here is what I came up with:
import os
import glob
import csv
import xlsxwriter
from xlsxwriter.workbook import Workbook
import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
#import openpyxl
#from openpyxl import load_workbook

os.chdir("/EagleScout")
path = '.'
extension = 'csv'

# Remove the combined .csv file from previous runs.
# This provides clean data without corruption from earlier runs.
if os.path.exists('./Spreadsheets/combined.csv'):
    os.remove('./Spreadsheets/combined.csv')

# Remove the previous Excel spreadsheet
if os.path.exists('./Spreadsheets/Tournament.xlsx'):
    os.remove('./Spreadsheets/Tournament.xlsx')

# Remove the previous combined Excel spreadsheet
if os.path.exists('./Spreadsheets/Combined.xlsx'):
    os.remove('./Spreadsheets/Combined.xlsx')

# Read in all .csv file names
files_in_dir = [f for f in glob.glob('*.csv')]

# Create a single combined .csv file with all data
# from all matches completed so far.
d1 = pd.read_csv('Header.txt')
d1.to_csv('./Spreadsheets/combined.csv', header=True, index=False)
for filenames in files_in_dir:
    df = pd.read_csv(filenames)
    fName, fExt = os.path.splitext(filenames)
    sName = fName.split('-')
    N = sName[1]
    df.insert(0, N, N, True)
    df.to_csv('./Spreadsheets/combined.csv', index_label=sName[0], mode='a')

# Combine all csv files into one master raw-data Excel file
# and add column headers as labels
with pd.ExcelWriter('./Spreadsheets/Combined.xlsx') as writer:
    dt = pd.read_csv('./Spreadsheets/combined.csv')
    dt.to_excel(writer, sheet_name='All data')
    writer.save()

# Parse through all .csv files and append content to the appropriate team worksheet.
with pd.ExcelWriter('./Spreadsheets/Tournament.xlsx') as writer:
    df2 = pd.read_excel('./Spreadsheets/Combined.xlsx')
    group = df2.groupby('Team')
    for Team, Team_df in group:
        Team_df.to_excel(writer, sheet_name=str(Team))
    writer.save()
I am certain there is a cleaner way to write this code, as I'm still new at this, but for now it does what I expect.
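For reference, a more compact variant is possible. The following is only an untested sketch, not the code I actually run; it assumes every csv file shares the same columns and that the team and match numbers only need to come from the file name, so the intermediate combined.csv and Header.txt can be skipped entirely:

import glob
import os
import pandas as pd

frames = []
for csvfile in glob.glob('*.csv'):
    team, match = os.path.splitext(csvfile)[0].split('-')
    df = pd.read_csv(csvfile)
    df.insert(0, 'Team', team)    # team number taken from the file name
    df.insert(1, 'Match', match)  # match number taken from the file name
    frames.append(df)

all_data = pd.concat(frames, ignore_index=True)

with pd.ExcelWriter('./Spreadsheets/Tournament.xlsx') as writer:
    all_data.to_excel(writer, sheet_name='All data', index=False)   # raw data sheet
    for team, team_df in all_data.groupby('Team'):                  # one sheet per team
        team_df.to_excel(writer, sheet_name=str(team), index=False)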

Related

Merge Multiple Excel files having multiple sheets to One Excel file,

I am trying to merge multiple Excel files, each with multiple sheets, into one Excel file that keeps the merged data in multiple sheets.
If all files have sheets "A", "B" and "C", the data from all the "A" sheets should be concatenated under sheet "A" in the single output file, and likewise for the other sheet names. I wrote the code below but it fails with the error 'NoneType' object has no attribute 'to_excel'.
import pandas as pd
#return all file paths that match a specific pattern, in our case we want all *.xlsx
import glob
import os
import openpyxl
import xlrd

#reading the excel files folder
location = r"C:\-----Desktop\python\Input/*.xlsx"
excel_files = glob.glob(location)

with pd.ExcelWriter(r"C:\---on\Output\filemergetest.xlsx") as writer:  # excel writer for multiple sheets
    for files in excel_files:
        sheet = os.path.basename(files)  # simplifying file name
        sheet = sheet.split(".")[0]      # simplifying file name

        list_of_dfs = []
        # Iterate through each worksheet
        for sheet in excel_files:
            fi = pd.ExcelFile(sheet)
            for sh in fi.sheet_names:
                # Parse data from each worksheet as a Pandas DataFrame
                dfC = fi.parse(sheet_name=0)
                # And append it to the list
                data2 = list_of_dfs.append(dfC)

        # Combine all DataFrames into one
        #data2 = pd.concat(list_of_dfs,ignore_index=True)
        # Preview first 10 rows
        data2

data2.to_excel(r"C:----python\Output\jointsheetsTAA.xlsx", index=False)
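No answer was posted for this one, but the error itself comes from data2 = list_of_dfs.append(dfC): list.append() returns None, so data2 ends up as None before to_excel is called. A rough sketch of the intended merge could look like the following (untested; the paths are placeholders and it assumes every workbook really does contain the same sheet names):

import glob
import pandas as pd

location = r"C:\path\to\Input\*.xlsx"      # placeholder input folder
output = r"C:\path\to\Output\merged.xlsx"  # placeholder output file

# collect the frames per sheet name across all workbooks
sheets = {}
for path in glob.glob(location):
    book = pd.ExcelFile(path)
    for sheet_name in book.sheet_names:
        sheets.setdefault(sheet_name, []).append(book.parse(sheet_name=sheet_name))

# write one concatenated frame per sheet name
with pd.ExcelWriter(output) as writer:
    for sheet_name, frames in sheets.items():
        pd.concat(frames, ignore_index=True).to_excel(writer, sheet_name=sheet_name, index=False)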

Copy Data from multiple excel files to existing file on specific sheet

I would like to copy the data from multiple Excel files and paste it into an existing Excel file.
import os
import openpyxl
import pandas as pd

my_path = r'C://Users//greencolor//Autoreport//Load_attachments//'  # updated every Monday with new files
my_path1 = r'C://Users//greencolor//Desktop//Autoreport//'          # location of Master.xlsx

for filename in os.listdir(my_path):           # loop through the downloaded files
    if filename.startswith('PB orders Dec'):   # finds the file named PB orders December.xlsb
        dec = pd.read_excel(os.path.join(my_path, filename),
                            sheet_name='Raw data ',
                            engine='pyxlsb')   # reading the specific sheet named Raw data
        with pd.ExcelWriter(my_path1 + '//Master.xlsx', mode='a', engine='openpyxl') as writer:
            dec.to_excel(writer, sheet_name='DecData', index=False)  # copies the data from PB orders December.xlsb into the Master file on sheet DecData without deleting the other sheets, i.e. updates the workbook with the new data
My problem is that I would like to do the same operation on multiple files. For example, if Load_attachments contains a file named PB orders November.xlsb, I want to apply the same code to that November file, and so on for whatever month name a file has.
You can put the entire thing (minus imports) in a for loop and feed it with an array of filename beginnings:
import os
import openpyxl
import pandas as pd

fnamebegin_array = ['PB orders Jan', 'PB orders Feb', 'PB orders Mar', 'etc']
sheetname_array = ['JanData', 'FebData', 'MarData', 'etc']

for i in range(len(fnamebegin_array)):  # would be 12, if there is data for each month in a year
    my_path = r'C://Users//greencolor//Autoreport//Load_attachments//'
    my_path1 = r'C://Users//greencolor//Desktop//Autoreport//'
    for filename in os.listdir(my_path):
        if filename.startswith(fnamebegin_array[i]):
            month = pd.read_excel(os.path.join(my_path, filename),
                                  sheet_name='Raw data ',
                                  engine='pyxlsb')
            with pd.ExcelWriter(my_path1 + '//Master.xlsx', mode='a', engine='openpyxl') as writer:
                month.to_excel(writer, sheet_name=sheetname_array[i], index=False)
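As a small design note, the two parallel arrays could also be kept as explicit (prefix, sheet name) pairs so they cannot get out of step. This is an untested variation on the answer above, with the same assumed paths and the pyxlsb engine:

import os
import pandas as pd

my_path = r'C://Users//greencolor//Autoreport//Load_attachments//'
my_path1 = r'C://Users//greencolor//Desktop//Autoreport//'

month_map = [('PB orders Jan', 'JanData'),
             ('PB orders Feb', 'FebData'),
             ('PB orders Mar', 'MarData')]  # extend to all twelve months

for prefix, sheet_name in month_map:
    for filename in os.listdir(my_path):
        if filename.startswith(prefix):
            month = pd.read_excel(os.path.join(my_path, filename),
                                  sheet_name='Raw data ',
                                  engine='pyxlsb')
            with pd.ExcelWriter(os.path.join(my_path1, 'Master.xlsx'),
                                mode='a', engine='openpyxl') as writer:
                month.to_excel(writer, sheet_name=sheet_name, index=False)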

How to delete duplicate columns in multiple Excel sheets of one workbook?

I have multiple sheets in one Excel workbook with duplicated columns in each sheet. I need to delete the duplicates and leave only the original columns.
I know how to drop duplicate columns within a single sheet:
df_sheet_map['> Acute Hospital Bed SLM']
result2=df_sheet_map['> Acute Hospital Bed SLM'].T.drop_duplicates().T
dfList = []
path = 'J:/TestDup'
newpath = 'J:/TestDup/Test2'

for fn in os.listdir(path):
    file = os.path.join(path, fn)
    if os.path.isfile(file):
        # Import the excel file and call it xlsx_file
        xlsx_file = pd.ExcelFile(file)
        # View the excel file's sheet names
        xlsx_file.sheet_names
        # Load the xlsx file's Data sheet as a dataframe
        df = xlsx_file.parse('Sheet1', header=None)
        df_NoHeader = df[2:]
        data = df_NoHeader
        # Save individual dataframe
        data.to_excel(os.path.join(newpath, fn))
        dfList.append(data)

appended_data = pd.concat(dfList)
appended_data.to_excel(os.path.join(newpath, 'master_data.xlsx'))
The above code works. However, I need to traverse all sheets in each workbook, and at the moment it only drops the first two rows; I need to change that part to drop the duplicate columns instead.
#Transpose all sheets in a workbook, then delete duplicates, then transpose back to the original shape and save all sheets.

#Transpose all sheets in the workbook file
import pyexcel
import pyexcel_xlsx as pe
from pyexcel_xlsx import get_data

book = pyexcel.get_book(file_name="H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
for sheet in book:
    sheet.transpose()
    pass
book.save_as("H:/SLM_Final/SLM Indicator template Main to clean.xlsx")

#Run the Excel VBA macro from Python
import win32com.client as win32
import time

xl = win32.Dispatch('Excel.Application')
xl.Visible = 0
ss = xl.Workbooks.Open('H:/SLM_Final/DeleteDup.xlsm')
xl.Run("deleteDuplicate")
time.sleep(30)
xl.Quit()
time.sleep(30)

#VBA macro to add to the Excel workbook
'''Sub deleteDuplicate()
    Dim ws As Worksheet
    Dim wkbk1 As Workbook
    Dim w As Long
    Dim lRow As Long
    Dim iCntr As Long
    Set wkbk1 = Workbooks.Open("H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
    'Set wkbk1 = ThisWorkbook
    wkbk1.Activate
    With wkbk1
        For w = 1 To .Worksheets.Count
            With Worksheets(w)
                .UsedRange.RemoveDuplicates Columns:=Array(3, 4), Header:=xlYes
            End With
        Next w
    End With
    wkbk1.Save
    wkbk1.Close
End Sub'''

#Transpose the sheets back to the original shape
import pyexcel
import pyexcel_xlsx as pe
from pyexcel_xlsx import get_data

book = pyexcel.get_book(file_name="H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
for sheet in book:
    sheet.transpose()
    #sheet.delete_duplicates(keep=False, inplace=False)
    pass
book.save_as("H:/SLM_Final/SLM Indicator template Main to clean.xlsx")
I hope this will help.
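For completeness, here is a pandas-only sketch of the same per-sheet deduplication, without the VBA step. It is untested; it assumes every sheet can be loaded with read_excel and that a "duplicate column" means a column whose values exactly match another's, as in the .T.drop_duplicates().T trick above:

import os
import pandas as pd

path = 'J:/TestDup'
newpath = 'J:/TestDup/Test2'

for fn in os.listdir(path):
    file = os.path.join(path, fn)
    if not os.path.isfile(file):
        continue
    # sheet_name=None returns a dict of {sheet name: DataFrame} for every sheet
    sheets = pd.read_excel(file, sheet_name=None)
    with pd.ExcelWriter(os.path.join(newpath, fn)) as writer:
        for sheet_name, df in sheets.items():
            # transpose, drop duplicate rows (i.e. duplicate columns), transpose back
            deduped = df.T.drop_duplicates().T
            deduped.to_excel(writer, sheet_name=sheet_name, index=False)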

Read excel files from a folder, extract a cell from sheet-1 and append values to a new excel sheet

I have around 100 Excel files in a folder. I need to extract a single cell, say the name in cell D6, from sheet-1 of each file and output the values to a new Excel file/sheet. I have followed a few SO questions but have not been able to get the desired output. This is the error I get when I run the program below:
TypeError: cannot concatenate a non-NDFrame object
import os
import pandas as pd
import xlrd
import xlwt

files = os.listdir(path)
files
all_data = pd.DataFrame()
for file in files:
    wb = xlrd.open_workbook(file)
    sheet = wb.sheet_by_index(0)
    df = sheet.cell_value(5, 3)
    all_data.append(df, ignore_index=True)

writer = pd.ExcelWriter('output.xlsx', engine='xlsxwriter')
all_data.to_excel(writer, 'sheet1')
writer.save()
Your error says that you can only concatenate a DataFrame with another DataFrame. When you read the cell with xlrd you don't get a DataFrame object, so either make the single cell a DataFrame or store the values temporarily and build the DataFrame afterwards.
Something like this (untested) should do it:
all_data = []  # list
for file in files:
    df = pd.read_excel(file, sheet_name='sheet-1')
    all_data.append(df.iloc[5, 3])
all_data = pd.DataFrame(all_data)  # dataframe
all_data.to_excel('all_data.xlsx')
Or one could use other libraries to do the same thing, openpyxl for example.
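For instance, an openpyxl version of the same idea might look like this (untested sketch; it assumes the workbooks are .xlsx, that D6 is the wanted cell, and that path is the same folder variable used in the question):

import os
from openpyxl import load_workbook
import pandas as pd

values = []
for file in os.listdir(path):
    wb = load_workbook(os.path.join(path, file), read_only=True, data_only=True)
    ws = wb.worksheets[0]          # first sheet
    values.append(ws['D6'].value)  # cell D6 as a plain value
    wb.close()

pd.DataFrame(values, columns=['name']).to_excel('all_data.xlsx', index=False)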

Converting multiple xls files to xlsx- issues with scaling up from single file

We have a few thousand xls files, with dozens of sheets in each file. We are working on a larger project to combine the files and sheets, but first need to convert them to xlsx.
The following code works fine on a single file:
import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook

xlsBook = xlrd.open_workbook("C://path")
workbook = openpyxlWorkbook()

for i in xrange(0, xlsBook.nsheets):
    xlsSheet = xlsBook.sheet_by_index(i)
    sheet = workbook.active if i == 0 else workbook.create_sheet()
    sheet.title = xlsSheet.name

    for row in xrange(0, xlsSheet.nrows):
        for col in xrange(0, xlsSheet.ncols):
            sheet.cell(row=row+1, column=col+1).value = xlsSheet.cell_value(row, col)

workbook.save("C://path/workbook.xlsx")
This works perfectly.
When attempting to loop through all files, we use:
import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook
import glob
import pandas as pd
from pandas import ExcelWriter
import os

path = "C://path"
path2 = "C://path2"
allFiles = glob.glob(path + "/*.xls")

for file_ in allFiles:
    xlsBook = xlrd.open_workbook(file_)
    workbook = openpyxlWorkbook()

    for i in xrange(0, xlsBook.nsheets):
        xlsSheet = xlsBook.sheet_by_index(i)
        sheet = workbook.active if i == 0 else workbook.create_sheet()
        sheet.title = xlsSheet.name

        for row in xrange(0, xlsSheet.nrows):
            for col in xrange(0, xlsSheet.ncols):
                sheet.cell(row=row+1, column=col+1).value = xlsSheet.cell_value(row, col)

    ##workbook.save(os.path.join(path2,file_))
    ##workbook.to_excel(os.path.join(path2,file_))

workbook.save("C://path/workbook.xlsx")
For the first two commented-out save methods, workbook.save seems to do absolutely nothing, and to_excel tells me workbook does not have an attribute called to_excel... is that because I didn't call pandas in the loop?
The final workbook.save was a test; I assumed it would save the final iteration of the loop correctly, since it worked in the script with just one file.
Instead, it creates the file, with all of the worksheets correctly named, but no data in any of the worksheets.
Any idea what I am missing? To be clear, I am looking to have each file named with its original filename at the end of the loop, and a valid xlsx extension.
I'd try this way instead. Simpler code and it worked when I tested it.
import pandas as pd
import glob

def converter(filename):
    xl = pd.ExcelFile(filename)   # reads the file in
    sheet_names = xl.sheet_names  # gets the sheet names of the file
    sheets_dict = {}              # dictionary with sheet_names as keys and data as values
    for sheet in sheet_names:
        sheets_dict[sheet] = xl.parse(sheet)
    # takes the file path and keeps only the file name, now with an .xlsx extension
    writer = pd.ExcelWriter(r'C:\Users\you\Desktop\\' + filename.split('\\')[-1][:-4] + '.xlsx')
    for sheet_name, data in sheets_dict.iteritems():
        data.to_excel(writer, sheet_name, index=False)
    writer.save()

files = glob.glob(r'C:\Users\you\Desktop' + '\*.xls')
for file in files:
    converter(file)
Edit: I'm not too familiar with openpyxl, but I don't believe it has a .to_excel method. I think you were creating an openpyxl workbook but then trying to save it using a pandas method.
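To get each converted file written into path2 under its original name, as the question asks, the output path can be built from path2 and the source file name. This is an untested sketch on top of the answer above, kept in the same older pandas style, where path and path2 are the same placeholder folders as in the question:

import os
import glob
import pandas as pd

path = "C://path"    # folder with the .xls files
path2 = "C://path2"  # folder for the converted .xlsx files

for file_ in glob.glob(path + "/*.xls"):
    xl = pd.ExcelFile(file_)
    base = os.path.splitext(os.path.basename(file_))[0]   # original file name, no extension
    writer = pd.ExcelWriter(os.path.join(path2, base + '.xlsx'))
    for sheet_name in xl.sheet_names:                      # copy every sheet across
        xl.parse(sheet_name).to_excel(writer, sheet_name, index=False)
    writer.save()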
