I have a class that reads a dataframe and another class that processes it. The functions in the processing class should be applied to the same dataframe step by step to shape the final dataframe, which is then saved as a CSV file.
from pydantic import BaseModel
from config import DATA_REPO
import pandas as pd
import os
class PandaDataFrame(BaseModel):
data: pd.DataFrame
class Config:
arbitrary_types_allowed = True
class Directory(BaseModel):
data_directory: str
class DataToPandaReader(object):
def csv_file_reader(self, directory: Directory):
directory = directory.data_directory
for file in os.listdir(directory):
if file.endswith('.csv'):
return pd.read_csv(os.path.join(directory, file))
class DataProcessor(object):
def remove_punctuation(self, my_: PandaDataFrame):
my_data_to_process = my_.data
for col in my_data_to_process:
if any(word in col for word in ['example', 'text', 'Answer']):
my_data_to_process = my_data_to_process[col].str.replace('[^\w\s]', '', regex=True)
return add_number_column(my_data_to_process)
def add_number_column(self, my_: PandaDataFrame):
my_data_to_process = my_.data
my_data_to_process['sentence_number'] = range(len(my_data_to_process))
return save_final_dataframe(my_data_to_process)
def save_final_dataframe(self, my_:PandaDataFrame):
my_data_to_process = my_.data
return my_data_to_process.to_csv('final_data.csv')
def parse_data_process(directory_to_csv_file):
toprocess = DataProcessor()
toprocess.save_final_dataframe(directory_to_csv_file)
toprocess.remove_punctuation(directory_to_csv_file)
toprocess.add_number_column(directory_to_csv_file)
return toprocess
if __name__ == '__main__':
parse_data_process(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
Now, for example, to call the first function in the DataProcessor class, I would do the following:
DataProcessor().remove_punctuation(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
But my intention is to run all these functions in the DataProcessor class step by step, so that save_final_dataframe saves a dataframe that has its punctuation removed and also has a number column.
Update:
Following the answer given, I made these changes, but I get an error that the functions are not known.
def parse_data_process(directory_to_csv_file):
toprocess = DataProcessor()
toprocess.save_final_dataframe(directory_to_csv_file)
toprocess.remove_punctuation(directory_to_csv_file)
toprocess.add_number_column(directory_to_csv_file)
return toprocess
if __name__ == '__main__':
parse_data_process(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
Unless I've misunderstood your use-case, all you need to do is replace
return my_data_to_process
...in the remove_punctuation function with
return self.add_number_column(my_data_to_process)
...then replace
return my_data_to_process
...in the add_number_column function with
return self.save_final_dataframe(my_data_to_process)
Note the self. prefix: these are instance methods, so calling add_number_column(...) as a bare name raises a NameError at runtime, which is the "functions are not known" error mentioned in the update.
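Putting the pieces together, a minimal sketch of the corrected chain (assuming the PandaDataFrame wrapper from the question); each method re-wraps the raw dataframe so the next one receives the PandaDataFrame it expects:
class DataProcessor(object):
    def remove_punctuation(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        for col in my_data_to_process.columns:
            if any(word in col for word in ['example', 'text', 'Answer']):
                # assign back to the column rather than overwriting the whole dataframe
                my_data_to_process[col] = my_data_to_process[col].str.replace(r'[^\w\s]', '', regex=True)
        return self.add_number_column(PandaDataFrame(data=my_data_to_process))

    def add_number_column(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        my_data_to_process['sentence_number'] = range(len(my_data_to_process))
        return self.save_final_dataframe(PandaDataFrame(data=my_data_to_process))

    def save_final_dataframe(self, my_: PandaDataFrame):
        return my_.data.to_csv('final_data.csv')
With that chain in place, parse_data_process reduces to a single call:
def parse_data_process(directory_to_csv_file):
    return DataProcessor().remove_punctuation(directory_to_csv_file)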
I have a core class where I am trying to read a zip file from S3, unzip it, get a specific file, and read that file's contents. This works fine, but now I am trying to mock all the things I used along the way.
import io
import json
import re
import zipfile

import boto3

class ZipService:
    def __init__(self, path: str):
        self.path = path

    def get_manifest_json_object(self):
        s3 = boto3.resource('s3')
        bucket_name, key = self.__get_bucket_and_key()
        bucket = s3.Bucket(bucket_name)
        zip_object_reference = bucket.Object(key).get()["Body"]
        zip_object_bytes_stream = self.__get_zip_object_bytes_stream(zip_object_reference)
        zipf = zipfile.ZipFile(zip_object_bytes_stream, mode='r')
        return self.__get_manifest_json(zipf)

    def __get_bucket_and_key(self):
        pattern = r"https:\/\/(.?[^\.]*)\.(.?[^\/]*)\/(.*)"  # this regex is working but don't know how :D
        result = re.split(pattern, self.path)
        return result[1], result[3]

    def __get_zip_object_bytes_stream(self, zip_object_reference):
        return io.BytesIO(zip_object_reference.read())

    def __get_manifest_json(self, zipf):
        manifest_json_text = [zipf.read(name) for name in zipf.namelist() if "/manifest.json" in name][0].decode("utf-8")
        return json.loads(manifest_json_text)
For this I have written a test case that throws an error:
import zipfile

import boto3
from unittest import TestCase
from unittest.mock import patch

@patch('boto3.resource')
class TestZipService(TestCase):
    def test_zip_service(self, mock):
        s3 = boto3.resource('s3')
        bucket = s3.Bucket("abc")
        bucket.Object.get.return_value = "some-value"
        zipfile.ZipFile.return_value = "/some-path"
        inst = ZipService("/some-path")
        with mock.patch.object(inst, "_ZipService__get_manifest_json", return_value={"a": "b"}) as some_object:
            expected = {"a": "b"}
            actual = inst.get_manifest_json_object()
            self.assertIsInstance(expected, actual)
Error:
bucket_name, key = self.__get_bucket_and_key()
File "/Us.tox/py38/lib/python3.8/site-packages/services/zip_service.py", line 29, in __get_bucket_and_key
return result[1], result[3]
IndexError: list index out of range
What exactly is wrong here? Any hints would also be appreciated. TIA
You are giving your ZipService a path of "/some-path".
Then you test its get_manifest_json_object method, whose second statement calls __get_bucket_and_key.
You are not mocking __get_bucket_and_key, so when it's called it tries to process that input path with a regex split. That split won't produce the four-item collection it needs in order to return result[1], result[3].
Hence, IndexError: list index out of range.
Either give your ZipService a proper path you'd expect, or mock all private methods used in get_manifest_json_object.
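For the second route, a minimal sketch of such a test (the S3 URL below is hypothetical, and ZipService is assumed to be importable):
from unittest import TestCase
from unittest.mock import patch

class TestZipService(TestCase):
    @patch('boto3.resource')
    def test_get_manifest_json_object(self, mock_resource):
        # a path shaped the way __get_bucket_and_key expects (hypothetical URL)
        inst = ZipService("https://some-bucket.s3.amazonaws.com/path/to/archive.zip")
        # mock the remaining private methods so no real S3 or zip handling happens
        with patch.object(inst, "_ZipService__get_zip_object_bytes_stream"), \
             patch("zipfile.ZipFile"), \
             patch.object(inst, "_ZipService__get_manifest_json", return_value={"a": "b"}):
            actual = inst.get_manifest_json_object()
        self.assertEqual({"a": "b"}, actual)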
I have functions that look like this:
import datetime
import json
import os

def file1_exists(directory):
    file1_path = os.path.join(directory, 'file1.json')
    return os.path.exists(file1_path)

def file2_exists(directory):
    file2_path = os.path.join(directory, 'file2.log')
    return os.path.exists(file2_path)

def create_file1(directory):
    if file1_exists(directory):
        return
    if not file2_exists(directory):
        return
    mod_time = os.stat(os.path.join(directory, 'file2.log')).st_mtime
    timestamp = {
        "creation_timestamp": datetime.datetime.fromtimestamp(mod_time).isoformat()
    }
    with open(os.path.join(directory, "file1.json"), "w") as f:
        json.dump(timestamp, f)
And I need to create unit tests that use mock files.
The three unit tests that I need are:
1. A mocked file1.json where I assert that the function returns None (based on the first if statement, since the file exists).
2. A way to mock-hide file2.log in order to assert that the function returns None (based on the second if statement).
3. A mocked file1.json where I write the required data and then assert that the result matches the expected outcome.
So far I've tried tests 1 and 2 with variations of this, but I've been unsuccessful:
class TestAdminJsonCreation(unittest.TestCase):
    @patch('os.path.exists', return_value=True)
    def test_existing_admin_json(self):
        self.assertNone(postprocess_results.create_json_file())
I've also read about other solutions such as:
Python testing: using a fake file with mock & io.StringIO
But I haven't found a way to successfully do what I need...
You want to be able to provide different return values for each call to os.path.exists. Since you know the order of the calls, you can use side_effect to supply a list of values to be used in order.
import unittest
from unittest.mock import patch

class TestAdminJsonCreation(unittest.TestCase):
    # file1.json already exists, so create_file1 returns None
    @patch('os.path.exists', return_value=True)
    def test_file1_already_exists(self, mock_exists):
        self.assertIsNone(create_file1('.'))

    # no file1.json and no file2.log, so create_file1 returns None
    @patch('os.path.exists', side_effect=[False, False])
    def test_missing_file2_log(self, mock_exists):
        self.assertIsNone(create_file1('.'))

    # no file1.json, file2.log present: file1.json gets written
    @patch('os.path.exists', side_effect=[False, True])
    def test_write_data(self, mock_exists):
        ...
The third test requires an actual file system, or for you to mock open.
So, I ended up breaking my original function into three different functions for easier testing.
The tests check what the result of create_file1 would be when we feed it different return values from the other two functions, and when we add valid data.
import json
import unittest
from unittest.mock import mock_open, patch

class TestFile1JsonCreation(unittest.TestCase):
    # Note: the patch targets must be the dotted path of the module under test,
    # e.g. 'postprocess_results.file1_exists'; bare names are shown here for brevity.
    @patch('builtins.open', new_callable=mock_open())
    @patch('os.stat')
    @patch('file1_exists', return_value=True)
    @patch('file2_exists', return_value=False)
    def test_existing_file1_json(self, file2_exists, file1_existsmock, stat, mopen):
        create_file1('.')
        # file1.json should not have been written
        mopen.assert_not_called()

    @patch('builtins.open', new_callable=mock_open())
    @patch('os.stat')
    @patch('file1_exists', return_value=False)
    @patch('file2_exists', return_value=False)
    def test_missing_file2(self, file2_exists, file1_existsmock, stat, mopen):
        create_file1('.')
        # file1.json should not have been written
        mopen.assert_not_called()

    @patch('builtins.open', new_callable=mock_open())
    @patch('os.stat')
    @patch('file1_exists', return_value=False)
    @patch('file2_exists', return_value=True)
    def test_write_data(self, file2_exists, file1_existsmock, stat, mopen):
        class FakeStat:
            st_mtime = 1641992788
        stat.return_value = FakeStat()
        create_file1('.')
        # file1.json should have been written
        mopen.assert_called_once_with('./file1.json', 'w')
        written_data = ''.join(
            c[1][0]
            for c in mopen().__enter__().write.mock_calls
        )
        expected_data = {"creation_timestamp": "2022-01-12T13:06:28"}
        written_dict_data = json.loads(written_data)
        self.assertEqual(written_dict_data, expected_data)
I have a class FileSet with a method _process_series, which contains a bunch of if-elif blocks doing filetag-specific formatting of different pandas.Series:
elif filetag == "EntityA":
ps[filetag+"_Id"] = str(ps[filetag+"_Id"]).strip()
ps[filetag+"_DateOfBirth"] = str(pd.to_datetime(ps[filetag+"_DateOfBirth"]).strftime('%Y-%m-%d')).strip()
ps[filetag+"_FirstName"] = str(ps[filetag+"_FirstName"]).strip().capitalize()
ps[filetag+"_LastName"] = str(ps[filetag+"_LastName"]).strip().capitalize()
ps[filetag+"_Age"] = relativedelta(datetime.today(), datetime.strptime(ps[filetag+"_DateOfBirth"], "%Y-%m-%d")).years
return ps
I'd like to define an abstract format method in the class and keep these blocks of formatting in separate modules that are imported when _process_series is called for a given filetag. Forgive the pseudo-code, but something like:
for tag in filetag:
from my_formatters import tag+'_formatter' as fmt
ps = self.format(pandas_series, fmt)
return ps
And the module would contain the formatting block:
# my_formatters.EntityA_formatter
ps[filetag+"_Id"] = str(ps[filetag+"_Id"]).strip()
ps[filetag+"_DateOfBirth"] = str(pd.to_datetime(ps[filetag+"_DateOfBirth"]).strftime('%Y-%m-%d')).strip()
ps[filetag+"_FirstName"] = str(ps[filetag+"_FirstName"]).strip().capitalize()
ps[filetag+"_LastName"] = str(ps[filetag+"_LastName"]).strip().capitalize()
ps[filetag+"_Age"] = relativedelta(datetime.today(), datetime.strptime(ps[filetag+"_DateOfBirth"], "%Y-%m-%d")).years
return ps
You can create a function in its own .py file and import it. If you define the same function name in each file, you can then call it the same way regardless of which module it came from.
Here is f1.py:
def gimme():
return 'format 1'
Here is f2.py:
def gimme():
return 'format 2'
Then in your main file:
module_names = ['f1', 'f2']
for module_name in module_names:
    import_test = __import__(module_name)
    result = import_test.gimme()
    print(result)
Which gives the output:
format 1
format 2
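The same string-based lookup can also be written with importlib.import_module, the documented wrapper around __import__; a small sketch of the equivalent loop:
import importlib

module_names = ['f1', 'f2']
for module_name in module_names:
    module = importlib.import_module(module_name)  # import a module by its string name
    print(module.gimme())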
Why not use globals() together with a star import:
from my_formatters import *
for tag in filetag:
fmt = globals()[tag + '_formatter']
ps = self.format(pandas_series, fmt)
return ps
I converted your pseudocode to real code.
globals documentation:
Return a dictionary representing the current global symbol table. This is always the dictionary of the current module (inside a function or method, this is the module where it is defined, not the module from which it is called).
Your pseudocode could be made into real code like so:
import my_formatters
for tag in filetag:
fmt = getattr(my_formatters, tag + '_formatter')
ps = self.format(pandas_series, fmt)
return ps
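One detail both approaches rely on: a module body can't contain a bare return, so each formatting block has to be wrapped in a function before it can be imported and passed around. A sketch of what my_formatters could look like for the EntityA block (the function name follows the tag + '_formatter' convention from the pseudocode):
# my_formatters.py
from datetime import datetime

import pandas as pd
from dateutil.relativedelta import relativedelta

def EntityA_formatter(ps, filetag="EntityA"):
    # the same formatting block as in _process_series, now a passable callable
    ps[filetag + "_Id"] = str(ps[filetag + "_Id"]).strip()
    ps[filetag + "_DateOfBirth"] = str(pd.to_datetime(ps[filetag + "_DateOfBirth"]).strftime('%Y-%m-%d')).strip()
    ps[filetag + "_FirstName"] = str(ps[filetag + "_FirstName"]).strip().capitalize()
    ps[filetag + "_LastName"] = str(ps[filetag + "_LastName"]).strip().capitalize()
    ps[filetag + "_Age"] = relativedelta(datetime.today(), datetime.strptime(ps[filetag + "_DateOfBirth"], "%Y-%m-%d")).years
    return ps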
I would create a Python class "ExcelFile" to handle adding multiple sheets to a workbook.
The function should be generic, so anyone on the team could use it easily.
I did the development and everything is fine; my code is like:
def addSheet(df, sheet_name):
    # -- read the template
    # -- add the sheet
    # -- replace the existing file
and then I call the function as many times as needed, depending on how many sheets I want to add, so:
addSheet(df1,"sheet_name1")
addSheet(df2,"sheet_name2")
addSheet(df3,"sheet_name3")
I want to refactor my code into a Python class and implement a design pattern that will help me do the job by calling:
xls_file = ExcelFile().useTemplate("template_path").addSheet(df, "sheet_name1").addSheet(df2, "sheet_name2").writeXlsFile("filename")
What is the name of the design pattern to do something like this?
So after searching, I found this is called method chaining (a fluent interface): each method returns self so the calls can be strung together. I implemented it like this:
import pandas as pd
import openpyxl

class ExcelFile(object):
    template_path = None
    file_path = None
    book = None
    xlwriter = None

    def replaceSheet(self, dataframe, sheet_name):
        pandas_df = dataframe.toPandas()  # dataframe is a Spark DataFrame here
        self.book = openpyxl.load_workbook(self.template_path)
        self.xlwriter = pd.ExcelWriter(self.template_path, engine='openpyxl')
        # remove the sheet if it already exists, so it can be replaced
        if sheet_name in self.book.sheetnames:
            self.book.remove(self.book[sheet_name])
        # note: assigning to ExcelWriter.book relies on older pandas (the setter was deprecated in 1.5)
        self.xlwriter.book = self.book
        pandas_df.to_excel(self.xlwriter, sheet_name=sheet_name, index=False, header=True)
        return self

    def useTemplate(self, template_path):
        self.template_path = template_path
        return self

    def writeFile(self, file_path):
        self.book.save(file_path)
        self.xlwriter.save()
        self.xlwriter.close()
        return file_path
And instead of calling the function many times, I call it like this:
xls_file = ExcelFile()
xls_file.useTemplate(template_path=templatepath) \
.replaceSheet(dataframe=Sales_df, sheet_name="DB Sales") \
.replaceSheet(dataframe=cost_df, sheet_name="DB COST") \
.replaceSheet(dataframe=b2c_df, sheet_name="DB B2C") \
.writeFile(file_path=local_tmp_file)
I've written a simple script to save out the names of various subfolders into a spreadsheet. It seems to be doing its job at every point up to the return statement. It returns None...
If I add a print statement before the return I can see a populated dataFrame.
I guess I'm missing something obvious, would appreciate some help!
Thanks
import sys, os, glob
from glob import glob
import pandas as pd
def findSubFoldersMultiple(iter,data_container):
if iter > 0:
current_directory = sys.argv[iter]
directory_reformatted = sys.argv[iter] + "/*/"
folders = glob(directory_reformatted )
folders_stripped = [ folder.replace(sys.argv[iter],'').replace('/','') for folder in folders]
curr_data_container = pd.DataFrame({ current_directory: folders_stripped })
combined_data_container = pd.concat([data_container,curr_data_container],axis=1)
findSubFoldersMultiple(iter-1,combined_data_container)
else:
print('Populated container in loop: \n' )
print(data_container)
return data_container
if len(sys.argv)<2:
print ("Please specify directory/directories.")
else:
writer = pd.ExcelWriter('subfolders.xlsx')
empty_frame = pd.DataFrame({})
populated_DF = findSubFoldersMultiple(len(sys.argv) - 1, empty_frame)
print('Returned container: \n' )
print(populated_DF)
Catch the return value by changing the last line in the if block to:
return findSubFoldersMultiple(iter-1,combined_data_container)
Otherwise you're returning the value on the base case (the else block), but not returning it further up the chain of non-base case recursive calls.
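In other words, the recursive branch has to pass the value back up. A minimal sketch of the fixed shape, with the directory-scanning body elided:
def findSubFoldersMultiple(iter, data_container):
    if iter > 0:
        combined_data_container = ...  # build the combined DataFrame exactly as before
        # returning the recursive call propagates the populated frame back up the stack
        return findSubFoldersMultiple(iter - 1, combined_data_container)
    else:
        return data_container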