I would like to create a Python class "ExcelFile" to handle adding multiple sheets to a workbook.
The function should be generic, so anyone on the team can use it easily.
I did the development and everything works fine. My code looks like:
def addSheet(df, sheet_name):
    # read the template
    # add the sheet
    # replace the existing file
and then I call the function as many times as there are sheets I want to add:
addSheet(df1,"sheet_name1")
addSheet(df2,"sheet_name2")
addSheet(df3,"sheet_name3")
I want to refactor my code into a Python class and implement a design pattern that will let me do the job by calling:
xls_file = ExcelFile().useTemplate("template_path").addSheet(df,"sheet_name1").addSheet(df2,"sheet_name2").writeXlsFile("filename")
What is the name of the design pattern for doing something like this?
So, after searching:
I implemented method chaining (a fluent interface):
import pandas as pd
import openpyxl


class ExcelFile(object):

    template_path = None
    file_path = None
    book = None
    xlwriter = None

    def replaceSheet(self, dataframe, sheet_name):
        pandas_df = dataframe.toPandas()
        if self.book is None:
            # load the template once and reuse the same workbook/writer for every sheet
            self.book = openpyxl.load_workbook(self.template_path)
            self.xlwriter = pd.ExcelWriter(self.template_path, engine='openpyxl')
            self.xlwriter.book = self.book
        # drop the sheet first if it already exists in the template
        if sheet_name in self.book.sheetnames:
            self.book.remove(self.book[sheet_name])
        pandas_df.to_excel(self.xlwriter, sheet_name=sheet_name, index=False, header=True)
        return self

    def useTemplate(self, template_path):
        self.template_path = template_path
        return self

    def writeFile(self, file_path):
        # save the modified workbook to the target path; the template itself is left untouched
        self.book.save(file_path)
        return file_path
And instead of calling the function many times, I call it with:
xls_file = ExcelFile()
xls_file.useTemplate(template_path=templatepath) \
    .replaceSheet(dataframe=Sales_df, sheet_name="DB Sales") \
    .replaceSheet(dataframe=cost_df, sheet_name="DB COST") \
    .replaceSheet(dataframe=b2c_df, sheet_name="DB B2C") \
    .writeFile(file_path=local_tmp_file)
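For what it's worth, this style is usually called method chaining, or a fluent interface (often combined with the Builder pattern): every method returns self so the calls can be strung together. As a side note, on pandas 1.3 or newer the template handling can be pushed onto ExcelWriter itself via mode='a' and if_sheet_exists='replace'. A minimal sketch under that assumption (and assuming the dataframes are already pandas DataFrames; call .toPandas() first if they come from Spark):

import shutil
import pandas as pd


class ExcelFile:
    def __init__(self):
        self.template_path = None
        self.sheets = []                 # (dataframe, sheet_name) pairs collected before writing

    def useTemplate(self, template_path):
        self.template_path = template_path
        return self

    def replaceSheet(self, dataframe, sheet_name):
        self.sheets.append((dataframe, sheet_name))
        return self

    def writeFile(self, file_path):
        shutil.copyfile(self.template_path, file_path)    # start from a copy of the template
        # mode='a' + if_sheet_exists='replace' requires pandas >= 1.3 with openpyxl installed
        with pd.ExcelWriter(file_path, engine='openpyxl', mode='a',
                            if_sheet_exists='replace') as writer:
            for df, name in self.sheets:
                df.to_excel(writer, sheet_name=name, index=False)
        return file_path

The chained call shown above works unchanged against this version.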
I have a class that reads a dataframe and another class that processes it. The functions in the processing class should be applied to the same dataframe step by step to shape the final dataframe, which is then saved as a CSV file.
from pydantic import BaseModel
from config import DATA_REPO
import pandas as pd
import os


class PandaDataFrame(BaseModel):
    data: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True


class Directory(BaseModel):
    data_directory: str


class DataToPandaReader(object):
    def csv_file_reader(self, directory: Directory):
        directory = directory.data_directory
        for file in os.listdir(directory):
            if file.endswith('.csv'):
                return pd.read_csv(os.path.join(directory, file))


class DataProcessor(object):
    def remove_punctuation(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        for col in my_data_to_process:
            if any(word in col for word in ['example', 'text', 'Answer']):
                my_data_to_process = my_data_to_process[col].str.replace('[^\w\s]', '', regex=True)
        return add_number_column(my_data_to_process)

    def add_number_column(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        my_data_to_process['sentence_number'] = range(len(my_data_to_process))
        return save_final_dataframe(my_data_to_process)

    def save_final_dataframe(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        return my_data_to_process.to_csv('final_data.csv')


def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess


if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data=DataToPandaReader().csv_file_reader(
        Directory(data_directory=os.path.join(DATA_REPO, 'input_data')))))
Now, for example, to run the first function in the DataProcessor class, I would do the following:
DataProcessor().remove_punctuation(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
but my intention is to run all these functions in the DataProcessor class step by step, so the save_final_dataframe function would save the dataframe that has its punctuation removed and also has a number column.
Update:
Following the answer given, I made these changes, but I get an error that the functions are not known.
def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess


if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data=DataToPandaReader().csv_file_reader(
        Directory(data_directory=os.path.join(DATA_REPO, 'input_data')))))
Unless I've misunderstood your use-case, all you need to do is replace
return my_data_to_process
...in the remove_punctuation function with
return add_number_column(my_data_to_process)
...then replace
return my_data_to_process
...in the add_number_column function with
return save_final_dataframe(my_data_to_process)
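For the update above: inside a class those helper calls have to go through self, otherwise Python reports the names as unknown, which matches the error you are seeing. A minimal sketch of the chained steps, assuming each method takes and returns a plain pandas DataFrame rather than the PandaDataFrame wrapper:

import pandas as pd


class DataProcessor:
    def remove_punctuation(self, df: pd.DataFrame) -> pd.DataFrame:
        for col in df.columns:
            if any(word in col for word in ['example', 'text', 'Answer']):
                df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)
        return self.add_number_column(df)          # self. is required inside the class

    def add_number_column(self, df: pd.DataFrame) -> pd.DataFrame:
        df['sentence_number'] = range(len(df))
        return self.save_final_dataframe(df)

    def save_final_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        df.to_csv('final_data.csv', index=False)
        return df


# entry point: only the first step needs to be called explicitly, e.g.
# DataProcessor().remove_punctuation(some_dataframe)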
I have tried the code below, but it only reads an existing validation; I want code that creates a new drop-down menu.
import xlwings as xw

app = xw.App(visible=True)
wb = app.books.open('Test.xlsx')
sht = wb.sheets['Sheet1']
Formula1 = '"Dog,Cat,Bat"'
dv = sht.range('A1').api.Validation.Formula1
I have also tried openpyxl; it works, but it can't save the file while the workbook is open in Excel.
xlwings is a wrapper around win32com, which works much like VBA. Using a recorded VBA macro for reference, the following code should work.
import xlwings as xw
app = xw.App(visible=True)
wb = app.books.open('Test.xlsx')
sht = wb.sheets['Sheet1']
Formula1='Dog,Cat,Bat' # remove the redundant "
# set up validation
sht.range('A1').api.Validation.Add(Type=3, Formula1=Formula1)
The steps I took to explore this:
Record a macro of the default steps for creating a list-type data validation:
Sub Macro1()
'
' Macro1 Macro
'
'
    Range("A1").Select
    Application.WindowState = xlMaximized
    With Selection.Validation
        .Delete
        .Add Type:=xlValidateList, AlertStyle:=xlValidAlertStop, Operator:= _
            xlBetween, Formula1:="Dog,Cat,Bat"
        .IgnoreBlank = True
        .InCellDropdown = True
        .InputTitle = ""
        .ErrorTitle = ""
        .InputMessage = ""
        .ErrorMessage = ""
        .ShowInput = True
        .ShowError = True
    End With
End Sub
In fact, only two steps matter; everything else was left at its default:
select List from the validation criteria drop-down
type the source for the list
So the recorded macro can be simplified to:
' VBA
Range("A1").Validation.Add Type:=xlValidateList, Formula1:="Dog,Cat,Bat"
I've written a simple script to save out the names of various subfolders into a spreadsheet. It seems to be doing its job at every point up to the return statement. It returns None...
If I add a print statement before the return, I can see a populated DataFrame.
I guess I'm missing something obvious, would appreciate some help!
Thanks
import sys, os, glob
from glob import glob
import pandas as pd


def findSubFoldersMultiple(iter, data_container):
    if iter > 0:
        current_directory = sys.argv[iter]
        directory_reformatted = sys.argv[iter] + "/*/"
        folders = glob(directory_reformatted)
        folders_stripped = [folder.replace(sys.argv[iter], '').replace('/', '') for folder in folders]
        curr_data_container = pd.DataFrame({current_directory: folders_stripped})
        combined_data_container = pd.concat([data_container, curr_data_container], axis=1)
        findSubFoldersMultiple(iter - 1, combined_data_container)
    else:
        print('Populated container in loop: \n')
        print(data_container)
        return data_container


if len(sys.argv) < 2:
    print("Please specify directory/directories.")
else:
    writer = pd.ExcelWriter('subfolders.xlsx')
    empty_frame = pd.DataFrame({})
    populated_DF = findSubFoldersMultiple(len(sys.argv) - 1, empty_frame)
    print('Returned container: \n')
    print(populated_DF)
Catch the return value by changing the last line in the if block to:
return findSubFoldersMultiple(iter-1,combined_data_container)
Otherwise you're returning the value on the base case (the else block), but not returning it further up the chain of non-base case recursive calls.
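Put differently, the recursive branch has to return as well. A sketch of the fixed function, with the imports as in the question:

def findSubFoldersMultiple(iter, data_container):
    if iter > 0:
        current_directory = sys.argv[iter]
        folders = glob(sys.argv[iter] + "/*/")
        folders_stripped = [folder.replace(sys.argv[iter], '').replace('/', '') for folder in folders]
        curr_data_container = pd.DataFrame({current_directory: folders_stripped})
        combined_data_container = pd.concat([data_container, curr_data_container], axis=1)
        # propagate the result back up through every recursive call
        return findSubFoldersMultiple(iter - 1, combined_data_container)
    else:
        return data_container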
I am trying to test this function: it takes a pandas DataFrame row, uses it to make an FTP call whose result is saved to a CSV file, then opens that CSV, formats it, and saves it as a pickle.
I want to test the following:
builtins.open is called once with (path_to_raw, 'wb')
to_pickle is called once with (LOCAL_PKL.format(row.name))
Patching builtins.open does not seem to work since it is called indirectly by to_pickle, so the tests fail as builtins.open is called twice.
Function to Test:
def download_file(self, row):
    path_from = row['source']
    path_to_raw = LOCAL_RAW.format(row.name)

    self.connection = FTP(self.url)
    self.connection.login(self.username, self.password)
    with open(path_to_raw, 'wb') as f:
        self.connection.retrbinary('RETR ' + path_from, f.write)
    self.connection.quit()

    data = pd.read_csv(path_to_raw)
    data.columns = ['a', 'b', 'c']
    data.to_pickle(LOCAL_PKL.format(row.name))
Unit Tests:
import pandas as pd
import unittest.mock as mock
from unittest.mock import patch, mock_open, MagicMock, call

import maintain


@patch('builtins.open', create=True)
@patch('maintain.pd.read_csv')
def test_download_path(self, mock_open, mock_pd_read_csv):
    mock_pd_read_csv.return_value = pd.DataFrame()

    # mock.create_autospec
    def mock_pd_to_pickle(self, path):
        pass

    with patch.object(pd.DataFrame, 'to_pickle', mock_pd_to_pickle):
        real = maintain.DataFTP()
        real.connection = MagicMock(name='connection')
        row = pd.Series(data=['a', 'b'], index=['c', 'd'])
        row.name = 'anything'

        print(mock_open.assert_called_once_with(maintain.LOCAL_RAW.format(row.name), 'wb'))
        print(mock_pd_to_pickle.assert_called_once_with(maintain.LOCAL_PKL.format(row.name)))
So... this is clearly wrong, but I'm not sure why.
This test produces this error:
AssertionError: Expected 'read_csv' to be called once. Called 0 times.
Does anyone have any suggestions or know how to solve this?
Thank you!
I finally got it working with this:
@patch('builtins.open', new_callable=mock_open)
@patch('maintain.pd.read_csv', return_value=pd.DataFrame())
@patch.object(pd.DataFrame, 'to_pickle')
def test_download_path(self, mock_to_pickle, mock_read_csv, mock_open):
    real = maintain.EODDataFTP()
    real.connection = mock.Mock(name='connection')
    row = pd.Series(data=['', 'nyse'], index=['source', 'exchange'])
    row.name = 'anything'

    real.download_file(row)

    mock_open.assert_called_once_with(maintain.LOCAL_RAW.format(row.name), 'wb')
    mock_read_csv.assert_called_once()
    mock_to_pickle.assert_called_once_with(maintain.LOCAL_PKL.format(row.name))
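One detail that is easy to trip over here: stacked @patch decorators hand their mocks to the test bottom-up, so the decorator closest to the function supplies the first mock argument. That is why the signature reads (mock_to_pickle, mock_read_csv, mock_open) even though builtins.open is patched first. Schematically (the patch targets here are hypothetical):

@patch('pkg.outer')   # outermost patch -> last mock argument
@patch('pkg.inner')   # innermost patch -> first mock argument
def test_something(self, mock_inner, mock_outer):
    ...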
I am trying to create a class and I can't seem to get it to work. I'm fairly new to Python, so any assistance would be appreciated. Also, I'm not sure if this is the most efficient way to create and use an object. I am trying to build a well model and this is one piece of that model; once I get this simple issue figured out, the rest should be fairly easy. Thanks.
import sys
import os
import csv
import pyodbc
import pandas as pd
import pandas.io.sql as psql
from pandas import Series, DataFrame
from time import gmtime, strftime


# Drill Pipe Class
class DP:
    # Properties
    DP_ID = 1.00
    DP_OD = 1.00
    DP_Name = 'Drill Pipe'
    # test global
    idwel = '6683AFCEA5DF429CAC123213F85EB9B3'

    # Constructor <- accepts idwell to get info
    def __init__(self, idwell):
        self.id = idwell
    # ..

    # WV DB connection function -> returns a dataframe - Updated 7/5/17
    def WV_Read_Query(_query):
        try:
            cnxn = pyodbc.connect("DSN=SQL_R_WV")
            cur = cnxn.cursor()
            df = psql.read_sql(_query, cnxn)
            cnxn.close()
            # print(df)
            return df
        except "Error":
            return "Query Error...!"
    # ..

    def get_DP_Data(_id):
        _id = str(_id)
        DP_Query = """Select Top 1
            DS.des as 'dp_name', DS.SZIDNOM as 'dp_id',
            DS.SZODNOM as 'dp_od', DS.SYSCREATEDATE as 'date'
            From [dbo].[US_WVJOBDRILLSTRINGCOMP] DS
            Where IDWELL = '""" + _id + """'
            AND Des = 'Drill Pipe' Order by SYSCREATEDATE Desc"""
        mud_Data = WV_Read_Query(DP_Query)
        return mud_Data
    # ..

    DP_Table = get_DP_Data(id)

    def get_DP_ID(self, DP_Table):
        dp_id = DP_Table['dp_id']
        return dp_id
    # ..

    def get_DP_OD(self, DP_Table):
        dp_od = DP_Table['dp_od']
        return dp_od
    # ..

    def get_Date(self, DP_Table):
        u_date = DP_Table['date']
        return u_date
    # ..

    def get_Des(self, DP_Table):
        des = DP_Table['dp_name']
        return des
    # ..

    # Print DP Info
    def DP_Info(self):
        Des = get_Des()
        ID = get_DP_ID()
        OD = get_DP_OD()
        Updated = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        return Des + "\nDP Id:\t" + ID + "\nDP Id:\t" + OD + "\nUpdated:\t" + Updated
    # ..
# ...


dp = DP('6683AFCEA5DF429CAC123213F85EB9B3')
dp_info = dp.DP_Info()
print(dp_info)
Traceback (most recent call last):
  File "u:\Development\Python Scripts\HCP\CUC Export Files 8_7_17\Well_Model.py", line 71, in <module>
    class DP:
  File "u:\Development\Python Scripts\HCP\CUC Export Files 8_7_17\Well_Model.py", line 108, in DP
    DP_Table = get_DP_Data(id)
  File "u:\Development\Python Scripts\HCP\CUC Export Files 8_7_17\Well_Model.py", line 104, in get_DP_Data
    mud_Data = WV_Read_Query(DP_Query)
NameError: name 'WV_Read_Query' is not defined
If you define non-static, non-class methods within a class, the first argument is always an instance of that class. We usually call this argument self:
def WV_Read_Query(self, _query):
...
And,
def get_DP_Data(self, _id):
Furthermore, you call these methods on the object self:
self.WV_Read_Query(DP_Query)
You might wonder why the function is defined with two arguments but only one is passed in. That's because the instance is implicitly passed as the first argument, automatically.
This is equivalent to
DP.WV_Read_Query(self, DP_Query)
Where you call the method on the class, but explicitly pass the instance to it.
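As a minimal, self-contained illustration of both call styles (the class and method names here are made up, purely to show the mechanics):

class Example:
    def helper(self, value):
        return value * 2

    def run(self, value):
        return self.helper(value)      # instance passed implicitly as self


e = Example()
e.run(3)               # returns 6
Example.helper(e, 3)   # same call, instance passed explicitly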
Further reading:
Python classes
What is the difference between class and instance methods?
You need to access it with self. So
def get_DP_Data(self, _id):
    _id = str(_id)
    DP_Query = """Select Top 1
        DS.des as 'dp_name', DS.SZIDNOM as 'dp_id',
        DS.SZODNOM as 'dp_od', DS.SYSCREATEDATE as 'date'
        From [dbo].[US_WVJOBDRILLSTRINGCOMP] DS
        Where IDWELL = '""" + _id + """'
        AND Des = 'Drill Pipe' Order by SYSCREATEDATE Desc"""
    mud_Data = self.WV_Read_Query(DP_Query)
    return mud_Data
You also need to add self to several of your methods. The class instance will always be the first parameter of a method unless you define it as a static method using the @staticmethod decorator.
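For completeness, a method that never touches the instance can be marked with @staticmethod and then called without self getting involved; a small sketch reusing the names from the question (the body is just a placeholder, not the real query code):

class DP:
    @staticmethod
    def WV_Read_Query(_query):
        # no 'self' parameter: a static method cannot touch instance state
        print("would run:", _query)

    def get_DP_Data(self, _id):
        # still reachable through self (or through DP.WV_Read_Query)
        return self.WV_Read_Query("Select Top 1 ...")


DP().get_DP_Data('6683AFCEA5DF429CAC123213F85EB9B3')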