I am new to Python programming and seeking for some help/guidance in correcting my python code.
Here my query is.
I have one Excel file which has (7 Tabs).
I have one Folder which contains 7 different text file and each text file contains respective Tab SQL Query and each text file name is same as the Tab Name which is available in Excel File.
I have a written a Python code to loop through all the text file one by one and execute that each text file SQL query and whatever data will come in output that output data should dump into existing excel file in that respective sheet/tab. i am using pandas to do this, however, code is working fine but while updating data into excel pandas is removing all existing sheets from the file and updating only current output data into excel file.
Example: if Python code execute a text file(Filename: Data) and after executing this SQL query we got some data and this data should dump into excel file (sheetname: Data).
<pre><code>
import pypyodbc
import pandas as pd
import os
import ctypes
from pandas import ExcelWriter
fpath = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries"
xlfile = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries\Open_Case_Data.xlsx"
cnxn = pypyodbc.connect('Driver={SQL Server};Server=MyServerName;Database=MyDatabaseName;Trusted_Connection=Yes')
cursor = cnxn.cursor()
for subdir, dirs, files in os.walk(fpath):
for file in files:
#print(os.path.join(subdir,file))
filepath = os.path.join(subdir,file)
#print("FilePath: ", filepath)
if filepath.endswith(".txt"):
if file != "ClosedAging_Cont.txt":
txtdata = open(filepath, 'r')
script = txtdata.read().strip()
txtdata.close()
cursor.execute(script)
if file == "ClosedAging.txt":
txtdata = open(os.path.join(subdir,"ClosedAging_Cont.txt"), 'r')
script = txtdata.read().strip()
txtdata.close()
cursor.execute(script)
col = [desc[0] for desc in cursor.description]
data = cursor.fetchall()
df = pd.DataFrame(list(data),columns=col)
#save_xls(df,xlfile)
writer = pd.ExcelWriter(xlfile)
flnm = file.replace('.txt','').strip()
df.to_excel(writer,sheet_name=flnm,index=False)
writer.save()
print(file, " : Successfully Updated.")
else:
print(file, " : Ignoring this File")
else:
print(file, " : Ignoring this File")
ctypes.windll.user32.MessageBoxW(0,"Open Case Reporting Data Successfully Updated","Open Case Reporting",1)
</pre></code>
By looping through the text files, you overwrite the Excel file inside the loop each time. Instead instantiate pd.ExcelWriter(xlfile) and call writer.save() outside the loop.
The following example is adapted from the xlswriter documentation
You can find more information about multiple sheets here: xlswriter documentaion - multiple sheets
import pandas as pd
# Create a Pandas Excel writer using XlsxWriter as the engine outside the loop.
writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')
# Sample loop, replace with directory browsing loop
for i in range(7):
# Sample Pandas dataframe. Replace with SQL query and resulting data frame.
df = pd.DataFrame({'DataFromSQLQuery': ['SQL query result {0}'.format(i)]})
# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name='Sheet{0}'.format(i))
# Close the Pandas Excel writer and output the Excel file.
writer.save()
The following code addresses the concrete question but is untested.
import pypyodbc
import pandas as pd
import os
import ctypes
from pandas import ExcelWriter
fpath = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries"
xlfile = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries\Open_Case_Data.xlsx"
cnxn = pypyodbc.connect('Driver={SQL Server};Server=MyServerName;Database=MyDatabaseName;Trusted_Connection=Yes')
cursor = cnxn.cursor()
# Create a Pandas Excel writer using XlsxWriter as the engine outside the loop
writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')
# File loop
for subdir, dirs, files in os.walk(fpath):
for file in files:
filepath = os.path.join(subdir,file)
if filepath.endswith(".txt"):
if file != "ClosedAging_Cont.txt":
txtdata = open(filepath, 'r')
script = txtdata.read().strip()
txtdata.close()
cursor.execute(script)
if file == "ClosedAging.txt":
txtdata = open(os.path.join(subdir,"ClosedAging_Cont.txt"), 'r')
script = txtdata.read().strip()
txtdata.close()
cursor.execute(script)
col = [desc[0] for desc in cursor.description]
data = cursor.fetchall()
# Data frame from original question
df = pd.DataFrame(list(data),columns=col)
# Convert the dataframe to an XlsxWriter Excel object
flnm = file.replace('.txt','').strip()
df.to_excel(writer, sheet_name=flnm, index=False)
print(file, " : Successfully Updated.")
else:
print(file, " : Ignoring this File")
else:
print(file, " : Ignoring this File")
# Close the Pandas Excel writer and output the Excel file
writer.save()
ctypes.windll.user32.MessageBoxW(0,"Open Case Reporting Data Successfully Updated","Open Case Reporting",1)
Related
I am trying to save a pandas dataframe as an excel table to a sharepoint site. I have two separate blocks of code which achieve the below.(thanks for Stackoverflow community)
A script which can save a pandas df as excel table using ExcelWriter on local storage.
A Script which can save a local file to Sharepoint online.
I am very confused on how to combine these two to save a df to sharepoint online but the excel file should be a table not just a range of data. Kindly help
SCRIPT 1 to save excel range as table on local
##############################################################################
#
# An example of adding a dataframe to an worksheet table in an xlsx file
# using Pandas and XlsxWriter.
#
# Tables in Excel are used to group rows and columns of data into a single
# structure that can be referenced in a formula or formatted collectively.
#
# SPDX-License-Identifier: BSD-2-Clause
# Copyright 2013-2021, John McNamara, jmcnamara#cpan.org
#
import pandas as pd
# Create a Pandas dataframe from some data.
df = pd.DataFrame({
'Country': ['China', 'India', 'United States', 'Indonesia'],
'Population': [1404338840, 1366938189, 330267887, 269603400],
'Rank': [1, 2, 3, 4]})
# Order the columns if necessary.
df = df[['Rank', 'Country', 'Population']]
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('pandas_table.xlsx', engine='xlsxwriter')
# Write the dataframe data to XlsxWriter. Turn off the default header and
# index and skip one row to allow us to insert a user defined header.
df.to_excel(writer, sheet_name='Sheet1', startrow=1, header=False, index=False)
# Get the xlsxwriter workbook and worksheet objects.
workbook = writer.book
worksheet = writer.sheets['Sheet1']
# Get the dimensions of the dataframe.
(max_row, max_col) = df.shape
# Create a list of column headers, to use in add_table().
column_settings = [{'header': column} for column in df.columns]
# Add the Excel table structure. Pandas will add the data.
worksheet.add_table(0, 0, max_row, max_col - 1, {'columns': column_settings})
# Make the columns wider for clarity.
worksheet.set_column(0, max_col - 1, 12)
# Close the Pandas Excel writer and output the Excel file.
writer.save()
SCRIPT 2 to save any file to sharepoint
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
import os
baseurl = 'https://testsite.sharepoint.com/'
basesite = '/sites/project' # every share point has a home.
siteurl = baseurl + basesite
localpath = "pandas_table.xlsx"
remotepath = "Shared Documents/General/file.xlsx" # existing folder path under sharepoint site.
ctx_auth = AuthenticationContext(siteurl)
ctx_auth.acquire_token_for_user(<username>, <password>)
ctx = ClientContext(siteurl, ctx_auth) # make sure you auth to the siteurl.
with open(localpath, 'rb') as content_file:
file_content = content_file.read()
dir, name = os.path.split(remotepath)
file = ctx.web.get_folder_by_server_relative_url(dir).upload_file(name, file_content).execute_query()
I am not really sure how I can use the writer.save() with the sharepoint connector. Kindly advise
thanks in advance.
Given that you have your data in a df the following code will write to sharepoint using the O365 library.
from io import BytesIO
from tempfile import gettempdir
from O365 import Account, FileSystemTokenBackend
import pandas as pd
O365_CLIENT_ID = "client"
O365_SECRET = "secret"
O365_TENANT_ID = "<name>"
O365_SHAREPOINT = "<name>.sharepoint.com"
O365_SITE = "/sites/..."
def save_file(folder_path, filename, data):
"""save file to O365."""
account = Account(
(O365_CLIENT_ID, O365_SECRET),
auth_flow_type="credentials",
tenant_id=O365_TENANT_ID,
token_backend=FileSystemTokenBackend(
token_path=gettempdir(), token_filename="o365_token.txt"
),
)
if account.authenticate():
drive = (
account.sharepoint()
.get_site(O365_SHAREPOINT, O365_SITE)
.get_default_document_library()
)
subfolders = folder_path.split("/")
if len(subfolders) != 0:
items = drive.get_items()
for subfolder in subfolders:
try:
subfolder_drive = list(filter(lambda x, sf=subfolder: sf in x.name, items))[0]
items = subfolder_drive.get_items()
except Exception as excep: # pylint: disable=broad-except
raise f"Path {folder_path} does not exist." from excep
else:
subfolder_drive = drive.get_root_folder()
subfolder_drive.upload_file(
item=None,
item_name=filename,
stream=data,
stream_size=data.getbuffer().nbytes,
)
with BytesIO() as buf:
df.to_excel(buf, index=False)
buf.seek(0)
save_file('folder/to/upload/to', 'filename.xlsx', buf)
Writing to multiple sheets:
with BytesIO() as buf:
with pd.ExcelWriter( # pylint: disable=abstract-class-instantiated
buf,
engine="xlsxwriter",
) as writer:
df.to_excel(
writer,
sheet_name='sheet1',
index=False,
)
df.to_excel(
writer,
sheet_name='sheet2',
index=False,
)
buf.seek(0)
save_file('folder/to/upload/to', 'filename.xlsx', buf)
my propose:
if excel file not exist, create it and copy data table to it;
if excel file exist, copy to data table to new sheet.
but following code running, only copy to data to new sheet, original sheet in excel file was removed.
import os
import pandas as pd
import openpyxl
f_name = "123.xlsx" #target excel file
if os.path.exists(f_name):
"""if excel file exist, added table to another sheet"""
wb = openpyxl.load_workbook(f_name) #load excel file
writer = pd.ExcelWriter(f_name, engine="openpyxl")
writer.wb = wb
df = pd.DataFrame(pd.read_excel("table_2.xlsx")) #get table to be added excel file
df.to_excel(writer, sheet_name="sheet2",index=False) #write to another sheet
writer.save()
writer.close()
else:
"""if excel file not exit, create it"""
df_1 = pd.DataFrame() # create excel file
df_1.to_excel(f_name)
writer = pd.ExcelWriter(f_name)
df_2 = pd.DataFrame(pd.read_excel("table_1.xlsx")) # get table_1
df_2.to_excel(writer, sheet_name="sheet1",index=False) # write table_1 into excel file
writer.save()
writer.close()
import os
import pandas as pd
import openpyxl
f_name = "123.xlsx" #target excel file
if os.path.exists(f_name):
"""if excel file exist, added table to another sheet"""
wb = openpyxl.load_workbook(f_name) #load excel file
writer = pd.ExcelWriter(f_name, engine="openpyxl") #assign engine
writer.book = wb #overwrite if no this
df = pd.DataFrame(pd.read_excel("table_2.xlsx")) #get table to be added excel file
df.to_excel(writer, sheet_name="table_2",index=False) #write to another sheet
writer.save()
writer.close()
else:
"""if excel file not exit, create it"""
df_1 = pd.DataFrame() # create excel file
df_1.to_excel(f_name)
writer = pd.ExcelWriter(f_name)
df_2 = pd.DataFrame(pd.read_excel("table_1.xlsx")) # get table_1
df_2.to_excel(writer, sheet_name="table_1",index=False) # write table_1 into excel file
writer.save()
writer.close()
I have multiple directories, each of which containing any number of .xls files.
I'd like to take the files in any given directory and combine them into one .xls file, using the file names as the tab names.
For example if there are the files NAME.xls, AGE.xls, LOCATION.xls, I'd like to combine them into a new file with the data from NAME.xls on a tab called NAME, the data from AGE.xls on a tab called AGE and so on.
Each source .xls file only has one column of data with no headers.
This is what I have so far, and well it's not working.
Any help would be greatly appreciated (I'm fairly new to Python and I've never had to do anything like this before).
wkbk = xlwt.Workbook()
xlsfiles = glob.glob(os.path.join(path, "*.xls"))
onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
tabNames = []
for OF in onlyfiles:
if str(OF)[-4:] == ".xls":
sheetName = str(OF)[:-4]
tabNames.append(sheetName)
else:
pass
for TN in tabNames:
outsheet = wkbk.add_sheet(str(TN))
data = pd.read_excel(path + "\\" + TN + ".xls", sheet_name="data")
data.to_excel(path + "\\" + "Combined" + ".xls", sheet_name = str(TN))
Here is a small helper function - it supports both .xls and .xlsx files:
import pandas as pd
try:
from pathlib import Path
except ImportError: # Python 2
from pathlib2 import Path
def merge_excel_files(dir_name, out_filename='result.xlsx', **kwargs):
p = Path(dir_name)
with pd.ExcelWriter(out_filename) as xls:
_ = [pd.read_excel(f, header=None, **kwargs)
.to_excel(xls, sheet_name=f.stem, index=False, header=None)
for f in p.glob('*.xls*')]
Usage:
merge_excel_files(r'D:\temp\xls_directory', 'd:/temp/out.xls')
merge_excel_files(r'D:\temp\xlsx_directory', 'd:/temp/out.xlsx')
Can you try
import pandas as pd
import glob
path = 'YourPath\ToYour\Files\\' # Note the \\ at the end
# Create a list with only .xls files
list_xls = glob.glob1(path,"*.xls")
# Create a writer for pandas
writer = pd.ExcelWriter(path + "Combined.xls", engine = 'xlwt')
# Loop on all the files
for xls_file in list_xls:
# Read the xls file and the sheet named data
df_data = pd.read_excel(io = path + xls_file, sheet_name="data")
# Are the sheet containing data in all your xls file named "data" ?
# Write the data into a sheet named after the file
df_data.to_excel(writer, sheet_name = xls_file[:-4])
# Save and close your Combined.xls
writer.save()
writer.close()
Let me know if it works for you, I never tried engine = 'xlwt' as I don't work with .xls file but .xlsx
First of all, I am new to python (practically I have learned only from Sololearn, that too only up to half course). So I request you to give me a little bit detailed answer.
My task has following broad steps:-
Delete old .xlsx file(if any)
Convert two .xls files into .xlsx file using win32, delete the first row and then delete .xls file [weird xls files already downloaded into source directory + xlrd,pyexcel show error (unsupported format or corrupt) file in opening .xls file (online analysis of file predicts it to be html/htm) ]
Get data from xlsx file
First, delete old worksheet on google spreadsheet to remove old data. Create a new worksheet with the same name. Insert data into new worksheet on the google spreadsheet.
Open second sheet(which imports data from the first sheet) and update one cell in Dummy Sheet to make sure google spreadsheet is synchronised in the background.
Now, I wrote a code by combining many codes and by using a lot of google.
The code is working fine but it takes on an avg about 65 seconds to complete the whole process.
My question has 3 parts:-
Is there any way to directly access data from .xls file?
Is there any way this code's performance can be improved.
Any other more efficient method for completing the above-said task?
My Code:-
import time
import win32com.client as win32
import os
import openpyxl
from openpyxl.utils import get_column_letter
import gspread
from oauth2client.service_account import ServiceAccountCredentials
start = time.time()
# set input-output file locations
source_dir = "C:\\Users\\XYZ\\Downloads"
output_dir = "C:\\Users\\XYZ\\Excels"
# use creds to create a client to interact with the Google Drive API
# make sure to share files with email contained in json file
scope = ['https://spreadsheets.google.com/feeds']
# code will not work without json file
creds = ServiceAccountCredentials.from_json_keyfile_name("C:\\Users\\XYZ\\your.json", scope)
gc = gspread.authorize(creds)
# following code is to open any spreadsheet by name
sh = gc.open("First Sheet")
def save_as_xlsx(input_file,output_dir,output_file_name) :
# call excel using win32, then open .xls file
# delete first row and then save as .xlsx
excel = win32.gencache.EnsureDispatch('Excel.Application')
wb = excel.Workbooks.Open(input_file)
wbk = excel.ActiveWorkbook
sheet = wbk.Sheets(1)
sheet.Rows(1).Delete()
wb.SaveAs(output_dir + '\\' + output_file_name, FileFormat = 51)
#FileFormat = 51 is for .xlsx extension. FileFormat = 56 is for .xls extension
wb.Close()
excel.Application.Quit()
return True
def get_the_data_from_xlsx(output_dir,output_file_name) :
# use openpyxl.load to find out last cell of file
# store cell values in list called data
wb = openpyxl.load_workbook(output_dir + '\\' + output_file_name)
sheet = wb.active
max_row_no = sheet.max_row
max_column_no = sheet.max_column
max_column = get_column_letter(max_column_no)
last_cell = str(max_column) + str(max_row_no)
cell_addresses = sheet['A1' : last_cell]
data = []
for i in cell_addresses :
for e in i :
data.append(e.value)
return (data,last_cell)
def insert_data_into_spreadsheet(name_of_worksheet,data,last_cell) :
# Find a workbook by name in already opened spreadsheet
# delete the worksheet to clear old data
# create worksheet with same name to maintain import connections in sheets.
worksheet = sh.worksheet(name_of_worksheet)
sh.del_worksheet(worksheet)
worksheet = sh.add_worksheet(title=name_of_worksheet, rows="500", cols="30")
# store range of cells for spreadsheet in list named cell_list
cell_list = worksheet.range('A1' + ':' + str(last_cell))
# attach all the values from data list as per the cell_list
a = 0
for cell in cell_list :
cell.value = data[a]
a = a + 1
# update all cells stored in cell_list in one go
worksheet.update_cells(cell_list)
def delete_file(directory,file_initials) :
for filename in os.listdir(directory) :
if filename.startswith(file_initials) :
os.unlink(directory +"\\" + filename)
# check if files are in source_dir
for filename in os.listdir(source_dir) :
# check for file1.xls and set input_file name if any file exists.
if filename.startswith("file1"):
input_file = source_dir + "\\file1.xls"
output_file1 = "output_file1.xlsx"
# detect and delete any old file in output directory
delete_file(output_dir,"output_file1")
if save_as_xlsx(input_file,output_dir,output_file1) == True :
# delete the file from source directory after work is done
delete_file(source_dir,'file1')
# get data from new xlsx file
data_from_xlsx = get_the_data_from_xlsx(output_dir,output_file1)
data_to_spreadsheet = data_from_xlsx[0]
last_cell = data_from_xlsx[1]
# insert updated data into spreadsheet
insert_data_into_spreadsheet("file1_data",data_to_spreadsheet,last_cell)
# repeat the same process for 2nd file
if filename.startswith('file2'):
input_file = source_dir + "\\file2.xls"
output_file2 = "output_file2.xlsx"
delete_file(output_dir,"output_file2")
if save_as_xlsx(input_file,output_dir,output_file2) == True :
delete_file(source_dir,'file2')
data_from_xlsx = get_the_data_from_xlsx(output_dir,output_file2)
data_to_spreadsheet = data_from_xlsx[0]
last_cell = data_from_xlsx[1]
insert_data_into_spreadsheet("file2_data",data_to_spreadsheet,last_cell)
# open spreadsheet by name and open Dummy worksheet
# update one cell to sync the sheet with other sheets
sh = gc.open("second sheet")
worksheet = sh.worksheet("Dummy")
worksheet.update_acell('B1', '=Today()')
end = time.time()
print(end-start)
I have a dataframe that I'm exporting to Excel, and people want it in .xlsx. I use to_excel, but when I change the extension from .xls to .xlsx, the exporting step takes about 9 seconds as opposed to 1 second. Exporting to a .csv is even faster, which I believe is due to the fact that it's just a specially formatted text file.
Perhaps the .xlsx files just added a lot more features so it takes longer to write to them, but I'm hoping there is something I can do to prevent this.
Pandas defaults to using OpenPyXL for writing xlsx files which can be slower than than the xlwt module used for writing xls files.
Try it instead with XlsxWriter as the xlsx output engine:
df.to_excel('file.xlsx', sheet_name='Sheet1', engine='xlsxwriter')
It should be as fast as the xls engine.
As per different Python to Excel modules benchmark, pyexcelerate has better performance.
Below code used to take sqlite tables data into xlsx file datasheets. table is not stored in xlsx file unless raw size is less than 1000000 raws. In that case info is stored in csv file.
def passfile(datb, tables):
"""copy to xlsx or csv files tables from query results"""
import sqlite3
import pandas as pd
import timeit
import csv
from pyexcelerate import Workbook
from pathlib import Path
from datetime import date
dat_dir = Path("C:/XML")
db_path = dat_dir / datb
start_time = timeit.default_timer()
conn = sqlite3.connect(db_path) # database connection
c = conn.cursor()
today = date.today()
tablist = []
with open(tables, 'r') as csv_file: # tables to be collected file
csv_reader = csv.DictReader(csv_file)
for line in csv_reader:
tablist.append(line['table']) #column header
xls_file = "Param" + today.strftime("%y%m%d") + ".xlsx"
xls_path = dat_dir / xls_file # xls file path-name
csv_path = dat_dir / "csv" # csv path to store big data
wb = Workbook() # excelerator file init
for line in tablist:
try:
df = pd.read_sql_query("select * from " + line + ";", conn) # pandas dataframe from sqlite
if len(df) > 1000000: # excel not supported
print('save to csv')
csv_loc = line + today.strftime("%y%m%d") + '.csv.gz' # compressed csv file name
df.to_csv(csv_path / csv_loc, compression='gzip')
else:
data = [df.columns.tolist()] + df.values.tolist()
data = [[index] + row for index, row in zip(df.index, data)]
wb.new_sheet(line, data=data)
except sqlite3.Error as error: # sqlite error handling
print('SQLite error: %s' % (' '.join(error.args)))
print("saving workbook")
wb.save(xls_path)
end_time = timeit.default_timer()
delta = round(end_time - start_time, 2)
print("Took " + str(delta) + " secs")
c.close()
conn.close()
passfile("20200522_sqlite.db", "tablesSQL.csv")