Python unittest to create a mock .json file - python

I have functions that look like this:
def file1_exists(directory):
    """Return True if ``file1.json`` exists inside *directory*."""
    return os.path.exists(os.path.join(directory, 'file1.json'))
def file2_exists(directory):
    """Return True if ``file2.log`` exists inside *directory*."""
    # The path was previously bound to ``log_path`` while the return statement
    # read the undefined name ``file2_path``, raising NameError on every call.
    file2_path = os.path.join(directory, 'file2.log')
    return os.path.exists(file2_path)
def create_file1(directory):
    """Create ``file1.json`` in *directory*, stamped with file2.log's mtime.

    Returns None without touching the disk when ``file1.json`` already exists
    or when ``file2.log`` is missing. Otherwise writes a JSON object of the
    form ``{"creation_timestamp": <ISO-8601 local time>}``.
    """
    # Same order and short-circuiting as two separate guards: file2 is only
    # checked when file1 is absent.
    if file1_exists(directory) or not file2_exists(directory):
        return
    mod_time = os.stat(os.path.join(directory, 'file2.log')).st_mtime
    timestamp = {
        "creation_timestamp": datetime.datetime.fromtimestamp(mod_time).isoformat()
    }
    with open(os.path.join(directory, "file1.json"), "w") as f:
        json.dump(timestamp, f)
And I need to create a unittest that uses mock files.
The 3 Unittests that I need are:
A mock file1.json file, where I will assert that the function returns None (based on the 1st if statement, since the file exists).
A way to mock-hide the file2.log item in order to assert that the function returns None (based on the second if statement).
A mock file1.json file where I write the required data and then assert that the written content matches the expected outcome.
So far I've tried tests 1. and 2. with variations of this but I've been unsuccessful:
class TestAdminJsonCreation(unittest.TestCase):
#patch('os.path.exists', return_value=True)
def test_existing_admin_json(self):
self.assertNone(postprocess_results.create_json_file())
I've also read about other solutions such as:
Python testing: using a fake file with mock & io.StringIO
But I haven't found a way to successfully do what I need...

You want to be able to provide different return values for each call to os.path.exists. Since you know the order of the calls, you can use side_effect (singular) to supply a list of values to be used in order.
class TestAdminJsonCreation(unittest.TestCase):
    """Drive the two os.path.exists() checks through every branch.

    The function under test asks first whether the JSON file exists, then
    whether the log file exists, so each ``side_effect`` list is consumed in
    that order. Every test needs its own name; otherwise later definitions
    silently replace earlier ones in the class body.
    """

    # JSON file already exists: the first check returns early.
    @patch('os.path.exists', return_value=True)
    def test_existing_admin_json(self, mock_exists):
        self.assertIsNone(postprocess_results.create_json_file())

    # No JSON file, no log file: the second check returns early.
    @patch('os.path.exists', side_effect=[False, False])
    def test_missing_log_file(self, mock_exists):
        self.assertIsNone(postprocess_results.create_json_file())

    # No JSON file, log file present: the function goes on to write the file.
    @patch('os.path.exists', side_effect=[False, True])
    def test_write_admin_json(self, mock_exists):
        ...
The third test requires an actual file system, or for you to mock open.

So, I ended up breaking my original function into 3 different functions for easier testing.
The tests are performed by checking what the result of the 'def create_file1' would be when we feed it different return_values from the other 2 functions and when we add valid data.
class TestFile1JsonCreation(unittest.TestCase):
    """Exercise create_file1() with the existence checks, os.stat and open() mocked."""

    # patch() requires a dotted "module.attribute" target. Build it from the
    # module that defines create_file1 so the helpers are replaced in the
    # namespace create_file1 actually looks them up in.
    _MODULE = create_file1.__module__

    # Decorators apply bottom-up, so the mocks arrive in the order
    # file2_exists, file1_exists, os.stat, open.
    @patch('builtins.open', new_callable=mock_open)
    @patch('os.stat')
    @patch(_MODULE + '.file1_exists', return_value=True)
    @patch(_MODULE + '.file2_exists', return_value=False)
    def test_existing_file1_json(self, file2_exists, file1_existsmock, stat, mopen):
        create_file1('.')
        # file1.json should not have been written
        mopen.assert_not_called()

    @patch('builtins.open', new_callable=mock_open)
    @patch('os.stat')
    @patch(_MODULE + '.file1_exists', return_value=False)
    @patch(_MODULE + '.file2_exists', return_value=False)
    def test_missing_file2(self, file2_exists, file1_existsmock, stat, mopen):
        create_file1('.')
        # file1.json should not have been written
        mopen.assert_not_called()

    @patch('builtins.open', new_callable=mock_open)
    @patch('os.stat')
    @patch(_MODULE + '.file1_exists', return_value=False)
    @patch(_MODULE + '.file2_exists', return_value=True)
    def test_write_data(self, file2_exists, file1_existsmock, stat, mopen):
        class FakeStat:
            st_mtime = 1641992788

        stat.return_value = FakeStat()
        create_file1('.')
        # file1.json should have been written
        mopen.assert_called_once_with(os.path.join('.', 'file1.json'), 'w')
        # json.dump() may emit the document in several write() calls.
        written_data = ''.join(
            c[1][0]
            for c in mopen.return_value.write.mock_calls
        )
        # fromtimestamp() returns local time, so compute the expectation the
        # same way instead of hard-coding a value that only holds in UTC.
        expected_data = {
            "creation_timestamp":
                datetime.datetime.fromtimestamp(FakeStat.st_mtime).isoformat()
        }
        written_dict_data = json.loads(written_data)
        self.assertEqual(written_dict_data, expected_data)

Related

unable to mock all the private methods using python unittest

I have a core class where I am trying to read the zip file, unzip, get a specific file, and get the contents of that file. This works fine but now I am trying to mock all the things I used along the way.
class ZipService:
    """Fetch a zip archive from S3 and return its ``manifest.json`` as a dict.

    ``path`` is expected to look like ``https://<bucket>.<host>/<key>``.
    """

    # Group 1 is the text up to the first dot of the host (the bucket name),
    # group 2 the rest of the host, group 3 everything after the next slash
    # (the object key). Raw string: the escapes are meant for the regex engine.
    _S3_URL_PATTERN = re.compile(r"https:\/\/(.?[^\.]*)\.(.?[^\/]*)\/(.*)")

    def __init__(self, path: str):
        self.path = path

    def get_manifest_json_object(self):
        """Download the archive named by ``self.path`` and parse its manifest.

        Raises:
            IndexError: if ``self.path`` is not a recognised URL, or the
                archive has no ``/manifest.json`` entry.
        """
        s3 = boto3.resource('s3')
        bucket_name, key = self.__get_bucket_and_key()
        bucket = s3.Bucket(bucket_name)
        zip_object_reference = bucket.Object(key).get()["Body"]
        zip_object_bytes_stream = self.__get_zip_object_bytes_stream(zip_object_reference)
        zipf = zipfile.ZipFile(zip_object_bytes_stream, mode='r')
        try:
            return self.__get_manifest_json(zipf)
        finally:
            zipf.close()

    def __get_bucket_and_key(self):
        """Split ``self.path`` into ``(bucket_name, key)``."""
        match = self._S3_URL_PATTERN.search(self.path)
        if match is None:
            # IndexError is what the earlier re.split()-based lookup raised for
            # a non-matching path; the type is kept for existing callers and a
            # message is added so the cause is visible.
            raise IndexError(
                f"path {self.path!r} is not of the form https://<bucket>.<host>/<key>"
            )
        return match.group(1), match.group(3)

    def __get_zip_object_bytes_stream(self, zip_object_reference):
        """Read the whole streamed body into a seekable in-memory buffer."""
        return io.BytesIO(zip_object_reference.read())

    def __get_manifest_json(self, zipf):
        """Return the parsed JSON of the first entry containing ``/manifest.json``."""
        for name in zipf.namelist():
            if "/manifest.json" in name:
                return json.loads(zipf.read(name).decode("utf-8"))
        raise IndexError("archive contains no '/manifest.json' entry")
For this I have written a test case that throws an error:
#patch('boto3.resource')
class TestZipService(TestCase):
def test_zip_service(self, mock):
s3 = boto3.resource('s3')
bucket = s3.Bucket("abc")
bucket.Object.get.return_value = "some-value"
zipfile.ZipFile.return_value = "/some-path"
inst = ZipService("/some-path")
with mock.patch.object(inst, "_ZipService__get_manifest_json", return_value={"a": "b"}) as some_object:
expected = {"a": "b"}
actual = inst.get_manifest_json_object()
self.assertIsInstance(expected, actual)
Error:
bucket_name, key = self.__get_bucket_and_key()
File "/Us.tox/py38/lib/python3.8/site-packages/services/zip_service.py", line 29, in __get_bucket_and_key
return result[1], result[3]
IndexError: list index out of range
What exactly is wrong here? Any hints would also be appreciated. TIA
You are giving your ZipService a path of "/some-path".
Then you test its get_manifest_json_object method, whose 2nd statement calls __get_bucket_and_key.
You are not mocking __get_bucket_and_key, so when it's called it tries to process that input path with a regex split, which won't give you a collection with 4 items that it needs to return result[1], result[3].
Hence, IndexError: list index out of range.
Either give your ZipService a proper path you'd expect, or mock all private methods used in get_manifest_json_object.

Pytest - run same tests for different sets of input data

I have a number of functions that I want to test using pytest.
Throughout my testing, I use several input files that I specify at the top of the script:
import pytest
from mymodule.mymodule import *
test_bam = 'bam/test/test_reads_pb.bwa.bam'
sample_mapping_file = 'tests/test_sample_mapping.txt'
pb_errors_file = 'tests/data/pb_test_out.json'
pb_stats = 'tests/data/pb_stats.json'
I am then running several tests using this input:
#pytest.fixture
def options():
o, a = get_args()
return o
#pytest.fixture
def samples_dict():
d = get_sample_mapping(sample_mapping_file)
return d
#pytest.fixture
def read_stats(options, samples_dict):
stats, bam = clean_reads(options, test_bam, samples_dict)
return stats
#pytest.fixture
def clean_bam(options, samples_dict):
stats, bam = clean_reads(options, test_bam, samples_dict)
return bam
def test_errors(options, samples_dict, clean_bam):
"""Test successful return from find_errors"""
sample, genome, chrom = set_genome(options, test_bam, samples_dict)
base_data = find_errors(options, chrom, sample, clean_bam)
assert base_data
I would like to be able to run the same tests on multiple different sets of input, where test_bam, sample_mapping_file, pb_errors_file and pb_stats will all be different.
What's the best way of running the same tests on different sets of input data?
I've played around with using marks to run input-specific functions:
#pytest.mark.pd
def get_pb_data():
"""Read in all pb-related files"""
#pytest.mark.ab
def get_ab_data():
"""Read in all ab-related files"""
But this doesn't work with the fixtures that I have set up (unless I'm missing something).
Any advice would be great!
use pytest parametrize wrapper.
test_bam = 'bam/test/test_reads_pb.bwa.bam'
sample_mapping_file = 'tests/test_sample_mapping.txt'
pb_errors_file = 'tests/data/pb_test_out.json'
pb_stats = 'tests/data/pb_stats.json'
#pytest.mark.parametrize("config", [test_bam, sample_mapping_file, pb_errors_file, pb_stats])
def do_something(config):
#
It will create multiple test on every config test input and assign to config variable.
@pytest.mark.pd doesn't specify an input type; it adds a pd marker to the test, which can be used to select tests when running them, for example to run all the tests marked with pd:
pytest TestsFolder -m pd
If you want to run the tests on different sets of files you can store the files names in a csv for example and read the sets from there in the test parametrized marker
def data_source():
for files in read_files_groups_from_csv():
yield files
#pytest.mark.parametrize('files', data_source())
def test_errors(options, samples_dict, clean_bam, files):
"""for example, files parameter will be ['bam/test/test_reads_pb.bwa.bam', 'tests/test_sample_mapping.txt', 'tests/data/pb_test_out.json', 'tests/data/pb_stats.json']"""
mark.parametrize will run before the fixtures, so you can send files as a parameter to them as well
#pytest.fixture
def options(files):
d = samples_dict(files[1])
return d
If you don't want to rely on index create a class with files names as attributes and return it from data_source().

How to mock and test python open and pandas to_pickle

I am trying to test this function that takes a pandas dataframe row which it uses to make an ftp call saved to csv, opens that csv file, formats it, and saves it as a pickle.
I want to test the following:
builtins.open is called once with (path_to_raw, 'wb')
to_pickle is called once with (LOCAL_PKL.format(row.name))
Patching builtins.open does not seem to work since it is called indirectly by to_pickle, so the tests fail as builtins.open is called twice.
Function to Test:
def download_file(self, row):
    """Fetch ``row['source']`` over FTP, then re-save it as a pickle.

    The raw download is written to ``LOCAL_RAW.format(row.name)``, read back
    with ``pd.read_csv``, given the column names a/b/c and pickled to
    ``LOCAL_PKL.format(row.name)``.

    This is a method: the body reads ``self.url``, ``self.username`` and
    ``self.password``, so ``self`` must be the first parameter.
    """
    path_from = row['source']
    path_to_raw = LOCAL_RAW.format(row.name)
    self.connection = FTP(self.url)
    self.connection.login(self.username, self.password)
    with open(path_to_raw, 'wb') as f:
        self.connection.retrbinary('RETR ' + path_from, f.write)
    self.connection.quit()
    data = pd.read_csv(path_to_raw)
    data.columns = ['a','b','c']
    data.to_pickle(LOCAL_PKL.format(row.name))
Unit Tests:
import pandas as pd
import unittest.mock as mock
from unittest.mock import patch, mock_open, MagicMock, call
import maintain
#patch('builtins.open', create=True)
#patch('maintain.pd.read_csv')
def test_download_path(self, mock_open, mock_pd_read_csv):
mock_pd_read_csv.return_value = pd.DataFrame()
#mock.create_autospec
def mock_pd_to_pickle(self, path):
pass
with patch.object(pd.DataFrame, 'to_pickle', mock_pd_to_pickle):
real = maintain.DataFTP()
real.connection = MagicMock(name='connection')
row = pd.Series(data=['a','b'], index=['c','d'])
row.name = 'anything'
print(mock_open.assert_called_once_with(maintain.LOCAL_RAW.format(row.name), 'wb'))
print(mock_pd_to_pickle.assert_called_once_with(maintain.LOCAL_PKL.format(row.name)))
So... this is clear wrong, but I'm not sure why.
This test produces this error:
AssertionError: Expected 'read_csv' to be called once. Called 0 times.
Does anyone have any suggestions or know how to solve this.
Thank you!
I finally got it working with this:
# Decorators apply bottom-up, so the mocks arrive as
# (to_pickle, read_csv, open) after ``self``.
@patch('builtins.open', new_callable=mock_open)
@patch('maintain.pd.read_csv', return_value=pd.DataFrame())
@patch.object(pd.DataFrame, 'to_pickle')
def test_download_path(self, mock_to_pickle, mock_read_csv, mock_open):
    real = maintain.EODDataFTP()
    real.connection = mock.Mock(name='connection')
    row = pd.Series(data=['','nyse'], index=['source','exchange'])
    row.name = 'anything'
    real.download_file(row)
    # The raw download is opened exactly once, in binary write mode.
    mock_open.assert_called_once_with(maintain.LOCAL_RAW.format(row.name), 'wb')
    mock_read_csv.assert_called_once()
    mock_to_pickle.assert_called_once_with(maintain.LOCAL_PKL.format(row.name))

Add metadata to TestCase in Python's unittest

I'd like to add metadata to individual tests in a TestCase that I've written to use Python's unittest framework. The metadata (a string, really) needs to be carried through the testing process and output to an XML file.
Other than remaining with the test the data isn't going to be used by unittest, nor my test code. (I've got a program that will run afterwards, open the XML file, and go looking for the metadata/string).
I've previously used NUnit which allows one to use C# attribute to do this. Specifically, you can put this above a class:
[Property("SmartArrayAOD", -3)]
and then later find that in the XML output.
Is it possible to attach metadata to a test in Python's unittest?
Simple way for just dumping XML
If all you want to do is write stuff to an XML file after every unit test, just add a tearDown method to your test class (e.g. if you have class MyTest(unittest.TestCase), give it a tearDown method).
class MyTest(unittest.TestCase):
def tearDown(self):
dump_xml_however_you_do()
def test_whatever(self):
pass
General method
If you want a general way to collect and track metadata from all your tests and return it at the end, try creating an astropy table in your test class's __init__() and adding rows to it during tearDown(), then extracting a reference to your initialized instances of your test class from unittest, like this:
Step 1: set up a re-usable subclass of unittest.TestCase so we don't have to duplicate the table handling
(put all the example code in the same file or copy the imports)
"""
Demonstration of adding and retrieving meta data from python unittest tests
"""
import sys
import warnings
import unittest
import copy
import time
import astropy
import astropy.table
if sys.version_info < (3, 0):
from StringIO import StringIO
else:
from io import StringIO
class DemoTest(unittest.TestCase):
    """
    Demonstrates setup of an astropy table in __init__, adding data to the table in tearDown

    Each test instance owns a one-element list holding a results table with
    the columns Name, Result, Time (ms) and Notes; tearDown() appends one row
    describing the test that just ran.
    """

    def __init__(self, *args, **kwargs):
        super(DemoTest, self).__init__(*args, **kwargs)
        # Storing results in a list made it convenient to aggregate them later
        self.results_tables = [astropy.table.Table(
            names=('Name', 'Result', 'Time', 'Notes'),
            dtype=('S50', 'S30', 'f8', 'S50'),
        )]
        self.results_tables[0]['Time'].unit = 'ms'
        self.results_tables[0]['Time'].format = '0.3e'
        self.test_timing_t0 = 0
        self.test_timing_t1 = 0

    def setUp(self):
        """Record the start time of the test."""
        self.test_timing_t0 = time.time()

    def tearDown(self):
        """Append a row (name, pass/FAIL, elapsed ms, last traceback line) to the table."""
        test_name = '.'.join(self.id().split('.')[-2:])
        self.test_timing_t1 = time.time()
        dt = self.test_timing_t1 - self.test_timing_t0
        # Check for errors/failures in order to get state & description. https://stackoverflow.com/a/39606065/6605826
        # NOTE(review): _outcome and _feedErrorsToResult are private unittest
        # internals; confirm they still exist on the Python version in use.
        if hasattr(self, '_outcome'):  # Python 3.4+
            result = self.defaultTestResult()  # these 2 methods have no side effects
            self._feedErrorsToResult(result, self._outcome.errors)
            problem = result.errors or result.failures
            state = not problem
            if result.errors:
                exc_note = result.errors[0][1].split('\n')[-2]
            elif result.failures:
                exc_note = result.failures[0][1].split('\n')[-2]
            else:
                exc_note = ''
        else:  # Python 3.2 - 3.3 or 3.0 - 3.1 and 2.7
            # result = getattr(self, '_outcomeForDoCleanups', self._resultForDoCleanups)  # DOESN'T WORK RELIABLY
            # This is probably only good for python 2.x, meaning python 3.0, 3.1, 3.2, 3.3 are not supported.
            exc_type, exc_value, exc_traceback = sys.exc_info()
            state = exc_type is None
            exc_note = '' if exc_value is None else '{}: {}'.format(exc_type.__name__, exc_value)
        # Add a row to the results table
        self.results_tables[0].add_row()
        self.results_tables[0][-1]['Time'] = dt*1000  # Convert to ms
        self.results_tables[0][-1]['Result'] = 'pass' if state else 'FAIL'
        # Long names/notes are truncated to the column width; silence that warning.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=astropy.table.StringTruncateWarning)
            self.results_tables[0][-1]['Name'] = test_name
            self.results_tables[0][-1]['Notes'] = exc_note
Step 2: set up a test manager that extracts metadata
def manage_tests(tests):
    """
    Function for running tests and extracting meta data

    :param tests: list of classes sub-classed from DemoTest
    :return: (list of TextTestResult, Table, string)
        results returned by unittest (a one-element list)
        astropy table, or None when no test carried a results table
        string: formatted version of the table, or None
    """
    table_sorting_columns = ['Result', 'Time']
    # Build test suite
    suite_list = []
    for test in tests:
        suite_list.append(unittest.TestLoader().loadTestsFromTestCase(test))
    combo_suite = unittest.TestSuite(suite_list)
    # Run tests
    results = [unittest.TextTestRunner(verbosity=1, stream=StringIO(), failfast=False).run(combo_suite)]
    # Catch test classes
    # (relies on the protected TestSuite._tests attribute to reach the instances)
    suite_tests = []
    for suite in suite_list:
        suite_tests += suite._tests
    # Collect results tables
    results_tables = []
    for suite_test in suite_tests:
        if getattr(suite_test, 'results_tables', [None])[0] is not None:
            results_tables += copy.copy(suite_test.results_tables)
    # Process tables, if any
    if len(results_tables):
        a = []
        while (len(a) == 0) and len(results_tables):
            a = results_tables.pop(0)  # Skip empty tables, if any
        results_table = a
        for rt in results_tables:
            if len(rt):
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore', category=DeprecationWarning)
                    results_table = astropy.table.join(results_table, rt, join_type='outer')
        try:
            results_table = results_table.group_by(table_sorting_columns)
        except Exception:
            print('Error sorting test results table. Columns may not be in the preferred order.')
        column_names = list(results_table.columns.keys())
        alignments = ['<' if cn == 'Notes' else '>' for cn in column_names]
        if len(results_table):
            rtf = '\n'.join(results_table.pformat(align=alignments, max_width=-1))
            # Skipped tests never reach tearDown, so they add no row.
            exp_res = sum([result.testsRun - len(result.skipped) for result in results])
            if len(results_table) != exp_res:
                print('ERROR forming results table. Expected {} results, but table length is {}.'.format(
                    exp_res, len(results_table),
                ))
        else:
            rtf = None
    else:
        results_table = rtf = None
    return results, results_table, rtf
Step 3: Example usage
class FunTest1(DemoTest):
    """Demo case: one passing and one deliberately failing test."""

    # The tests take no ``self``, so they must be static methods.
    @staticmethod
    def test_pass_1():
        pass

    @staticmethod
    def test_fail_1():
        assert False, 'Meant to fail for demo 1'
class FunTest2(DemoTest):
    """Second demo case: one passing and one deliberately failing test."""

    # The tests take no ``self``, so they must be static methods.
    @staticmethod
    def test_pass_2():
        pass

    @staticmethod
    def test_fail_2():
        assert False, 'Meant to fail for demo 2'
# Run both demo cases, print the formatted table, then each runner result
# followed by the test and traceback of every recorded error.
res, tab, form = manage_tests([FunTest1, FunTest2])
print(form)
print('')
for r in res:
    print(r)
    for error in r.errors:
        print(error[0])
        print(error[1])
Sample results:
$ python unittest_metadata.py
Name Result Time Notes
ms
-------------------- ------ --------- ----------------------------------------
FunTest2.test_fail_2 FAIL 5.412e-02 AssertionError: Meant to fail for demo 2
FunTest1.test_fail_1 FAIL 1.118e-01 AssertionError: Meant to fail for demo 1
FunTest2.test_pass_2 pass 6.199e-03
FunTest1.test_pass_1 pass 6.914e-03
<unittest.runner.TextTestResult run=4 errors=0 failures=2>
Should work with python 2.7 or 3.7. You can add whatever columns you want to the table. You can add parameters and stuff to the table in setUp, tearDown, or even during the tests.
Warnings:
This solution accesses a protected attribute _tests of unittest.suite.TestSuite, which can have unexpected results. This specific implementation works as expected for me in python2.7 and python3.7, but slight variations on how the suite is built and interrogated can easily lead to strange things happening. I couldn't figure out a different way to extract references to the instances of my classes that unittest uses, though.

memoize to disk - python - persistent memoization

Is there a way to memoize the output of a function to disk?
I have a function
def getHtmlOfUrl(url):
... # expensive computation
and would like to do something like:
def getHtmlMemoized(url) = memoizeToFile(getHtmlOfUrl, "file.dat")
and then call getHtmlMemoized(url), so as to do the expensive computation only once for each url.
Python offers a very elegant way to do this - decorators. Basically, a decorator is a function that wraps another function to provide additional functionality without changing the function source code. Your decorator can be written like this:
import json
def persist_to_file(file_name):
    """Decorator factory: memoize a one-argument function in a JSON file.

    The cache is loaded from *file_name* when the decorator is applied (a
    missing or unparsable file yields an empty cache) and rewritten after
    every cache miss. Arguments and results must be JSON-serializable; JSON
    object keys are strings, so non-string arguments do not survive a reload
    unchanged.
    """
    import functools

    def decorator(original_func):
        try:
            # Context managers close the handle promptly instead of leaving
            # it to the garbage collector.
            with open(file_name, 'r') as cache_file:
                cache = json.load(cache_file)
        except (IOError, ValueError):
            cache = {}

        @functools.wraps(original_func)
        def new_func(param):
            if param not in cache:
                cache[param] = original_func(param)
                with open(file_name, 'w') as cache_file:
                    json.dump(cache, cache_file)
            return cache[param]

        return new_func

    return decorator
Once you've got that, 'decorate' the function using the @-syntax and you're ready.
#persist_to_file('cache.dat')
def html_of_url(url):
your function code...
Note that this decorator is intentionally simplified and may not work for every situation, for example, when the source function accepts or returns data that cannot be json-serialized.
More on decorators: How to make a chain of function decorators?
And here's how to make the decorator save the cache just once, at exit time:
import json, atexit
def persist_to_file(file_name):
    """Decorator factory: memoize in memory, save to *file_name* once at exit.

    The cache is loaded when the factory is called (a missing or unparsable
    file yields an empty cache). Every function decorated by the same factory
    call shares that cache, which is written back by an ``atexit`` hook.
    """
    import functools

    try:
        with open(file_name, 'r') as cache_file:
            cache = json.load(cache_file)
    except (IOError, ValueError):
        cache = {}

    def _save_cache():
        # Closes the file explicitly so the data is flushed before shutdown.
        with open(file_name, 'w') as cache_file:
            json.dump(cache, cache_file)

    atexit.register(_save_cache)

    def decorator(func):
        @functools.wraps(func)
        def new_func(param):
            if param not in cache:
                cache[param] = func(param)
            return cache[param]

        return new_func

    return decorator
Check out joblib.Memory. It's a library for doing exactly that.
from joblib import Memory
memory = Memory("cachedir")
#memory.cache
def f(x):
print('Running f(%s)' % x)
return x
A cleaner solution powered by Python's shelve module. The advantage is that the cache gets updated in real time via the well-known dict syntax; it is also exception-proof (no need to handle an annoying KeyError).
import shelve
def shelve_it(file_name):
    """Decorator factory: memoize a one-argument function in a shelve file.

    The shelf is opened once, when the factory is called, and stays open for
    the life of the process. Shelf keys must be strings, so the decorated
    function has to be called with a string argument.
    """
    import functools

    d = shelve.open(file_name)

    def decorator(func):
        @functools.wraps(func)
        def new_func(param):
            if param not in d:
                d[param] = func(param)
                # The shelf is never closed, so push each new entry to disk
                # right away rather than relying on interpreter shutdown.
                d.sync()
            return d[param]

        return new_func

    return decorator
#shelve_it('cache.shelve')
def expensive_funcion(param):
pass
This ensures the function is computed just once for a given argument. Subsequent calls return the stored result.
There is also diskcache.
from diskcache import Cache
cache = Cache("cachedir")
#cache.memoize()
def f(x, y):
print('Running f({}, {})'.format(x, y))
return x, y
The Artemis library has a module for this. (you'll need to pip install artemis-ml)
You decorate your function:
from artemis.fileman.disk_memoize import memoize_to_disk
#memoize_to_disk
def fcn(a, b, c = None):
results = ...
return results
Internally, it makes a hash out of input arguments and saves memo-files by this hash.
Check out Cachier. It supports additional cache configuration parameters like TTL etc.
Simple example:
from cachier import cachier
import datetime
#cachier(stale_after=datetime.timedelta(days=3))
def foo(arg1, arg2):
"""foo now has a persistent cache, trigerring recalculation for values stored more than 3 days."""
return {'arg1': arg1, 'arg2': arg2}
Something like this should do:
import json
class Memoize(object):
    """Memoize a function of positional arguments, with JSON save/load.

    Entries in ``self.memo`` are keyed by the JSON encoding of the argument
    tuple, because JSON object keys must be strings and a tuple key cannot be
    dumped. Arguments and results must therefore be JSON-serializable, and a
    tuple result comes back as a list after a save/load round trip.
    """

    def __init__(self, func):
        self.func = func
        self.memo = {}

    @staticmethod
    def _key(args):
        """Return the string key under which *args* is stored."""
        return json.dumps(args)

    def load_memo(self, filename):
        """Merge the entries saved in *filename* into the in-memory cache."""
        with open(filename) as f:
            self.memo.update(json.load(f))

    def save_memo(self, filename):
        """Write the in-memory cache to *filename* as one JSON object."""
        with open(filename, 'w') as f:
            json.dump(self.memo, f)

    def __call__(self, *args):
        key = self._key(args)
        if key not in self.memo:
            self.memo[key] = self.func(*args)
        return self.memo[key]
Basic usage:
your_mem_func = Memoize(your_func)
your_mem_func.load_memo('yourdata.json')
# do your stuff with your_mem_func
If you want to write your "cache" to a file after using it -- to be loaded again in the future:
your_mem_func.save_memo('yournewdata.json')
Assuming that your data is JSON-serializable, this code should work:
import os, json
def json_file(fname):
    """Decorator factory: cache a function's result in the JSON file *fname*.

    If *fname* exists, its parsed content is returned and the function is not
    called; the call arguments are not part of the cache key. Otherwise the
    function runs and its result is written to *fname* and returned.
    """
    import functools

    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            if os.path.isfile(fname):
                with open(fname, 'r') as f:
                    return json.load(f)
            # Compute and serialize before opening the file. Opening first
            # would leave an empty or partial cache file behind whenever the
            # function or the encoder raises, and every later call would then
            # fail while parsing it.
            ret = function(*args, **kwargs)
            payload = json.dumps(ret)
            with open(fname, 'w') as f:
                f.write(payload)
            return ret

        return wrapper

    return decorator
decorate getHtmlOfUrl and then simply call it, if it had been run previously, you will get your cached data.
Checked with python 2.x and python 3.x
You can use the cache_to_disk package:
from cache_to_disk import cache_to_disk
#cache_to_disk(3)
def my_func(a, b, c, d=None):
results = ...
return results
This will cache the results for 3 days, specific to the arguments a, b, c and d. The results are stored in a pickle file on your machine, and unpickled and returned next time the function is called. After 3 days, the pickle file is deleted until the function is re-run. The function will be re-run whenever the function is called with new arguments. More info here: https://github.com/sarenehan/cache_to_disk
Most answers are in a decorator fashion. But maybe I don't want to cache the result every time when calling the function.
I made one solution using context manager, so the function can be called as
with DiskCacher('cache_id', myfunc) as myfunc2:
res=myfunc2(...)
when you need the caching functionality.
The 'cache_id' string is used to distinguish data files, which are named [calling_script]_[cache_id].dat. So if you are doing this in a loop, will need to incorporate the looping variable into this cache_id, otherwise data will be overwritten.
Alternatively:
myfunc2=DiskCacher('cache_id')(myfunc)
res=myfunc2(...)
Alternatively (this is probably not quite useful as the same id is used all time time):
#DiskCacher('cache_id')
def myfunc(*args):
...
The complete code with examples (I'm using pickle to save/load, but can be changed to whatever save/read methods. NOTE that this is also assuming the function in question returns only 1 return value):
from __future__ import print_function
import sys, os
import functools
def formFilename(folder, varid):
    '''Compose abspath for cache file

    Args:
        folder (str): cache folder path.
        varid (str): variable id to form file name and used as variable id.
    Returns:
        abpath (str): abspath for cache file, which is using the <folder>
            as folder. The file name is the format:
                [script_file]_[varid].nc
    '''
    # Use only the script's base name: sys.argv[0] may carry a directory
    # (e.g. "/path/to/script.py"), and its separators would otherwise end up
    # inside the file name and point at folders that do not exist.
    script_file=os.path.splitext(os.path.basename(sys.argv[0]))[0]
    name='[%s]_[%s].nc' %(script_file, varid)
    abpath=os.path.join(folder, name)
    return abpath
def readCache(folder, varid, verbose=True):
    '''Read cached data

    Args:
        folder (str): cache folder path.
        varid (str): variable id.
    Keyword Args:
        verbose (bool): whether to print some text info.
    Returns:
        results (tuple): a tuple containing data read in from cached file(s).
    Raises:
        IOError: if no cache file exists for <varid> in <folder>.
    '''
    import pickle
    abpath_in=formFilename(folder, varid)
    if not os.path.exists(abpath_in):
        # Fail with a clear message instead of an UnboundLocalError on the
        # return statement.
        raise IOError('No cache file found: %s' % abpath_in)
    if verbose:
        print('\n# <readCache>: Read in variable', varid,
              'from disk cache:\n', abpath_in)
    # NOTE(review): pickle.load can execute arbitrary code; only read cache
    # files this program wrote itself.
    with open(abpath_in, 'rb') as fin:
        results=pickle.load(fin)
    return results
def writeCache(results, folder, varid, verbose=True):
    '''Write data to disk cache

    Args:
        results (tuple): a tuple containing data read to cache.
        folder (str): cache folder path. Not created here; it has to exist.
        varid (str): variable id.
    Keyword Args:
        verbose (bool): whether to print some text info.
    '''
    import pickle
    abpath_out=formFilename(folder, varid)
    if verbose:
        print('\n# <writeCache>: Saving output to:\n',abpath_out)
    with open(abpath_out, 'wb') as fout:
        pickle.dump(results, fout)
class DiskCacher(object):
    '''Disk cache usable as a context manager, a wrapper factory or a decorator.'''

    def __init__(self, varid, func=None, folder=None, overwrite=False,
                 verbose=True):
        '''Disk cache context manager

        Args:
            varid (str): string id used to save cache.
                function <func> is assumed to return only 1 return value.
        Keyword Args:
            func (callable): function object whose return values are to be
                cached.
            folder (str or None): cache folder path. If None, use a default.
            overwrite (bool): whether to force a new computation or not.
            verbose (bool): whether to print some text info.
        '''
        if folder is None:
            self.folder='/tmp/cache/'
        else:
            self.folder=folder
        self.func=func
        self.varid=varid
        self.overwrite=overwrite
        self.verbose=verbose

    def __enter__(self):
        '''Return the caching wrapper around the function given to __init__().'''
        if self.func is None:
            raise Exception("Need to provide a callable function to __init__() when used as context manager.")
        return _Cache2Disk(self.func, self.varid, self.folder,
                           self.overwrite, self.verbose)

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing to release; returning None lets exceptions propagate.
        return

    def __call__(self, func=None):
        '''Wrap <func>, or the function given to __init__() when <func> is omitted.'''
        _func=func or self.func
        return _Cache2Disk(_func, self.varid, self.folder, self.overwrite,
                           self.verbose)
def _Cache2Disk(func, varid, folder, overwrite, verbose):
    '''Inner decorator function

    Args:
        func (callable): function object whose return values are to be
            cached.
        varid (str): variable id.
        folder (str): cache folder path.
        overwrite (bool): whether to force a new computation or not.
        verbose (bool): whether to print some text info.
    Returns:
        decorated function: when called, it returns the cached data if the
            cache file exists and <overwrite> is False; otherwise it calls
            <func>, saves the return value to disk and returns it.
    '''
    # The cache path is fixed when the wrapper is created.
    abpath_in=formFilename(folder, varid)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if os.path.exists(abpath_in) and not overwrite:
            results=readCache(folder, varid, verbose)
        else:
            results=func(*args, **kwargs)
            if not os.path.exists(folder):
                os.makedirs(folder)
            writeCache(results, folder, varid, verbose)
        return results
    return wrapper
if __name__=='__main__':
    # Demonstrates the three ways of using DiskCacher; each call is repeated
    # so the second run is served from the cache file.
    data=range(10) # dummy data

    #--------------Use as context manager--------------
    def func1(data, n):
        '''dummy function'''
        results=[i*n for i in data]
        return results

    print('\n### Context manager, 1st time call')
    with DiskCacher('context_mananger', func1) as func1b:
        res=func1b(data, 10)
        print('res =', res)

    print('\n### Context manager, 2nd time call')
    with DiskCacher('context_mananger', func1) as func1b:
        res=func1b(data, 10)
        print('res =', res)

    print('\n### Context manager, 3rd time call with overwrite=True')
    with DiskCacher('context_mananger', func1, overwrite=True) as func1b:
        res=func1b(data, 10)
        print('res =', res)

    #--------------Return a new function--------------
    def func2(data, n):
        results=[i*n for i in data]
        return results

    print('\n### Wrap a new function, 1st time call')
    func2b=DiskCacher('new_func')(func2)
    res=func2b(data, 10)
    print('res =', res)

    print('\n### Wrap a new function, 2nd time call')
    res=func2b(data, 10)
    print('res =', res)

    #----Decorate a function using the syntax sugar----
    @DiskCacher('pie_dec')
    def func3(data, n):
        results=[i*n for i in data]
        return results

    print('\n### pie decorator, 1st time call')
    res=func3(data, 10)
    print('res =', res)

    print('\n### pie decorator, 2nd time call.')
    res=func3(data, 10)
    print('res =', res)
The outputs:
### Context manager, 1st time call
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[context_mananger].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Context manager, 2nd time call
# <readCache>: Read in variable context_mananger from disk cache:
/tmp/cache/[diskcache]_[context_mananger].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Context manager, 3rd time call with overwrite=True
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[context_mananger].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Wrap a new function, 1st time call
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[new_func].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Wrap a new function, 2nd time call
# <readCache>: Read in variable new_func from disk cache:
/tmp/cache/[diskcache]_[new_func].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### pie decorator, 1st time call
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[pie_dec].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### pie decorator, 2nd time call.
# <readCache>: Read in variable pie_dec from disk cache:
/tmp/cache/[diskcache]_[pie_dec].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
Here's a solution I came up with which can:
memoize mutable objects (memoized functions should have no side effects that change mutable parameters or it won't work as expected)
writes to a separate cache file for each wrapped function (easy to delete the file to purge that particular cache)
compresses the data to make it much smaller on disk (a LOT smaller)
It will create cache files like:
cache.__main__.function.getApiCall.db
cache.myModule.function.fixDateFormat.db
cache.myOtherModule.function.getOtherApiCall.db
Here's the code. You can choose a compression library of your choosing, but I've found LZMA works best for the pickle storage we are using.
import dbm
import hashlib
import pickle
# import bz2
import lzma
# COMPRESSION = bz2
COMPRESSION = lzma  # better with pickle compression


# Create a @memoize_to_disk decorator to cache a memoize to disk cache
def memoize_to_disk(function, cache_filename=None):
    """Memoize *function* in a dbm database of compressed pickles.

    Args:
        function: the callable to wrap. Its positional and keyword arguments
            must be JSON-serializable, because the cache key is a SHA-256
            hash of their JSON encoding.
        cache_filename: dbm file to use. Defaults to
            ``cache.<module>.<type name>.<function name>``.

    Returns:
        A wrapper that returns the stored result for arguments seen before
        and otherwise calls *function*, stores the result and returns it.
    """
    # Imported here because this snippet's import block does not provide them.
    import functools
    import json

    uniqueFunctionSignature = f'cache.{function.__module__}.{function.__class__.__name__}.{function.__name__}'
    if cache_filename is None:
        cache_filename = uniqueFunctionSignature

    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        # Convert the dictionary into a JSON object (can't memoize mutable fields, this gives us an immutable, hashable function signature)
        if cache_filename == uniqueFunctionSignature:
            # Cache file is function-specific, so don't include function name in params
            params = {'args': args, 'kwargs': kwargs}
        else:
            # add module.class.function name to params so no collisions occur if user overrides cache_file with the same cache for multiple functions
            params = {'function': uniqueFunctionSignature, 'args': args, 'kwargs': kwargs}
        # key hash of the json representation of the function signature (to avoid immutable dictionary errors)
        params_json = json.dumps(params)
        key = hashlib.sha256(params_json.encode("utf-8")).hexdigest()  # store hash of key
        # Get cache entry or create it if not found
        with dbm.open(cache_filename, 'c') as db:
            # Only the lookup is guarded, so a KeyError raised while
            # unpickling is not mistaken for a cache miss.
            try:
                blob = db[key]
            except KeyError:
                # If the result is not in the cache, call the function and store the result
                result = function(*args, **kwargs)
                db[key] = COMPRESSION.compress(pickle.dumps(result))
                return result
            # NOTE(review): pickle.loads can execute arbitrary code; only use
            # cache files this program wrote itself.
            return pickle.loads(COMPRESSION.decompress(blob))

    return wrapper
To use the code, apply the @memoize_to_disk decorator (or call memoize_to_disk(function, cache_filename) directly if you don't like "cache." as a prefix):
#memoize_to_disk
def expensive_example(n):
// expensive operation goes here
return value

Categories

Resources