I have a large program (a simulation of a business process) where I use a separate process to write finished parts to a file. The problem is that when writing the data to the file, the program still keeps the data in memory even if I explicitly tell it to delete it. If I run the same program with only the line that writes to the file commented out, it doesn't keep any of the data in memory.
Am I doing something wrong with freeing the memory?
This is the part of the code responsible for writing it to file:
import os

class CSVCustomWriter:
    def __init__(self, simulation_id, csv_delimiter, attribute_names):
        self.csv_delimiter = csv_delimiter
        self.simulation_id = simulation_id
        self.attribute_names = attribute_names
        self.header = ['CaseId', 'Activity', 'Start', 'End', 'Resource',
                       'CostPerEvent', 'CostPerResource'] + attribute_names

        self.base_path = os.path.join('results', str(self.simulation_id))
        if not os.path.exists(self.base_path):
            os.makedirs(self.base_path)

        self.path_csv = os.path.join(self.base_path, 'data.csv')
        self.file = open(self.path_csv, 'wt', encoding='utf-8')
        self.file.write(self.csv_delimiter.join(self.header) + '\n')
        self.closed = False

    def add(self, data):
        buffer = ''
        case_id, previous_activities, attributes = data
        current_attributes = tuple(attributes[a] for a in self.attribute_names)
        for a in previous_activities:
            buffer += self.csv_delimiter.join(
                (str(case_id), str(a[0]), str(a[1]), str(a[2]), str(a[6]), str(a[4]), str(a[5]))
                + current_attributes)
            buffer += '\n'

        self.file.write(buffer)

        del current_attributes
        del case_id
        del previous_activities
        del attributes
        del buffer

    def close(self):
        if not self.closed:
            self.file.close()
            self.closed = True
After each case ends, it's sent via the add method to be written to the file. If I remove the self.file.write(buffer) line, the RAM usage of this part of the program drops to nearly zero.
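If the suspicion is that the data is merely sitting in a write buffer, one quick check (a diagnostic sketch under that assumption, not part of the original program) is to drain both the Python-level and OS-level buffers after each write and see whether the memory usage changes:

import os

# Hypothetical diagnostic: after writing a chunk, flush Python's internal
# buffer and ask the OS to commit it, so buffered file I/O can be ruled out.
f = open('data.csv', 'wt', encoding='utf-8')
f.write('some,row,of,data\n')
f.flush()               # push Python's internal write buffer to the OS
os.fsync(f.fileno())    # ask the OS to write it out to disk
f.close()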
I want to parse a large wikipedia dump iteratively. I found a tutorial for this here: https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c
However, when I want to read in the data like this:
import subprocess
import xml.sax

data_path = 'C:\\Users\\Me\\datasets\\dewiki-latest-pages-articles1.xml-p1p262468.bz2'

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Iteratively process file
for line in subprocess.Popen(['bzcat'],
                             stdin=open(data_path),
                             stdout=subprocess.PIPE, shell=True).stdout:
    parser.feed(line)

    # Stop when 3 articles have been found
    if len(handler._pages) > 3:
        break
it seems like nothing happens. The handler._pages list, which is where the parsed articles should be stored, stays empty. I also added shell=True because otherwise I get the error FileNotFoundError: [WinError 2].
I have never worked with subprocesses in Python, so I don't know what the problem might be.
I also tried specifying the data_path differently (with / and //).
Thank you in advance.
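For reference, since bzcat is generally not available on Windows (which is what FileNotFoundError: [WinError 2] suggests), one possible workaround (a sketch, not from the tutorial or the original question) is to let Python's built-in bz2 module decompress the dump instead of piping it through a subprocess, reusing the parser and handler defined above:

import bz2

# Stream the compressed dump line by line without an external bzcat process
with bz2.open(data_path, 'rb') as dump:
    for line in dump:
        parser.feed(line)

        # Stop when 3 articles have been found
        if len(handler._pages) > 3:
            break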
In my main file, I call the following function to write some data to a binary file:
main.py
writeOutputFile(param1, param2, param3)
In file_a.writeOutputFile I open my output file in a with statement
and call the function file_b.writeReference:
file_a.py
# singleton
class BitstreamToFile:
    def __init__(self, outfile):
        self.outfile = outfile
        self.cache = ''

    def add(self, data, length):
        s = ''
        if type(data) == str:
            log.info("string %s \n " % data)
            for char in data:
                b = bin(ord(char))[2:]
                s = s + "{:0>8}".format(b)
        else:
            s = bin(data)[2:]
        if len(s) < length:
            resto = length - len(s)
            for _ in range(0, resto):
                s = '0' + s
        s = s[0:length]
        self.cache = self.cache + s
        self.flush()

    def writeByteToFile(self):
        if len(self.cache) < 8:
            raise ValueError("Not enough bits to make a byte")
        data = int(self.cache[:8], 2)
        log.info("writeByteToFile %s " % data)
        self.outfile.write(struct.pack('>B', data))
        self.cache = self.cache[8:]

    def flush(self, padding=False):
        while len(self.cache) >= 8:
            log.info("BEF flush len(self.cache) %s" % len(self.cache))
            self.writeByteToFile()
            log.info("AFT flush len(self.cache) %s" % len(self.cache))
        if padding:
            self.cache = "{:0<8}".format(self.cache)
            self.writeByteToFile()


def writeOutputFile(param1, param2, param3):
    [..]
    with open(OUTPUT_FILE, 'wb') as out_file:
        writeReference(out_file, param2, param1)
In file_b.writeReference I instantiate my BitstreamToFile object
file_b.py
def writeReference(out_file, param2, param1):
    bitstream = file_a.BitstreamToFile(out_file)
    log.debug("write key && length")
    bitstream.add("akey", 32)
    bitstream.add(0, 64)
    [..]
When I build and execute it the first time, I get no error. The second time, instead, I get:
# log from `file_B.writeReference`
write key && length
# log from file_a.bitstream.flush
BEF flush len(self.cache) 32
#log from file_a.bitstream.writeByteToFile
writeByteToFile 114
then the code crashes:
Exception on /encode [POST]
[..]
File "/src/file_a.py", line 83, in flush
self.writeByteToFile()
File "/src/file_a.py", line 73, in writeByteToFile
self.outfile.write(struct.pack('>B', data))
ValueError: write to closed file
"POST /encode HTTP/1.1" 500 -
Any hints on where the error might be? I do not really understand why it sometimes works and sometimes does not.
Thank you in advance
Not an answer.
Diagnostic tool:
Subclass io.FileIO and override the __enter__ and __exit__ methods, adding logging so you can see when the context manager enters and exits (is the file closed?). Maybe add more logging to other parts of the program for a finer-grained time history. Do some test runs with a fake file, or with something more isolated from your real code (I say this mainly because I don't know the consequences of using the subclass, so you should be careful). Here is an example:
import io

class B(io.FileIO):
    def __enter__(self):
        print(f'\tcontext manager entry - file:{self.name}')
        return super().__enter__()

    def __exit__(self, *args, **kwargs):
        print(f'\tcontext manager exiting - file:{self.name}')
        super().__exit__(*args, **kwargs)
In [32]: with B('1.txt','wb') as f:
    ...:     f.write(b'222')
    ...:
	context manager entry - file:1.txt
	context manager exiting - file:1.txt

In [33]:
The issue was related to the Docker container that runs the code I shared above.
I'm a newbie with Docker, so I was using the following command to bring up my containers (I have 3 micro-services):
$ docker-compose up -d --build
without realizing that, if the container is not re-created (no changes in the source code), the second run reuses the previously stopped container, in which my file had already been closed at the end.
If I force the container to be recreated (even when I do not need to change the source code):
$ docker-compose up -d --build --force-recreate
I get no more errors.
I have millions of domains to which I will send WHOIS queries, and I want to record the WHOIS responses in .txt files.
I would like to set a maximum size for a single .txt output file. For example, let's say I start recording responses in out0.txt. I want to switch to out1.txt once out0.txt is >= 100MB. The same goes for out1.txt: once out1.txt >= 100MB, start writing to out2.txt, and so on.
I know that I could do an if check after each write, but I want my code to be fast; I thought an if check for every domain might slow it down. (It will asynchronously query millions of domains.)
I imagined a try-except block could solve my issue here, like this:
folder_name = "out%s.txt"
folder_number = 0
folder_name = folder_name % folder_number
f = open(folder_name, 'w+')
for domain in millions_of_domains:
try:
response_json = send_whois_query(domain)
f.write(response_json)
except FileGreaterThan100MbException:
folder_number += 1
folder_name = folder_name % folder_number
f = open(folder_name, 'w+')
f.write(response_json)
Any suggestions will be appreciated. Thank you for your time.
You can create a wrapper object that tracks how much data has been written, and opens a new file when the limit is reached:
class MaxSizeFileWriter(object):
    def __init__(self, filenamepattern, maxdata=2**20,  # default 1Mb
                 start=0, mode='w', *args, **kwargs):
        self._pattern = filenamepattern
        self._counter = start
        self._mode = mode
        self._args, self._kwargs = args, kwargs
        self._max = maxdata
        self._openfile = None
        self._written = 0

    def _open(self):
        if self._openfile is not None:
            return  # noop if already open
        filename = self._pattern.format(self._counter)
        self._counter += 1
        self._openfile = open(filename, mode=self._mode, *self._args, **self._kwargs)
        self._written = 0  # start counting the new file from zero

    def _close(self):
        if self._openfile is not None:
            self._openfile.close()
            self._openfile = None

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self._close()

    def write(self, data):
        if self._written + len(data) > self._max:
            # current file too full to fit the data too, close it.
            # This will trigger a new file to be opened.
            self._close()
        self._open()  # noop if already open
        self._openfile.write(data)
        self._written += len(data)
The above is a context manager and can be used just like a regular file. Pass in a filename pattern with a {} placeholder where the number will be inserted:
folder_name = "out{}.txt"
with MaxSizeFileWriter(folder_name, maxdata=100 * 2**10) as f:
for domain in millions_of_domains:
response_json = send_whois_query(domain)
f.write(response_json)
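Note that the rollover only happens between writes: a single piece of data larger than maxdata is still written in full to a fresh file rather than split across files, and _open resets the _written counter so the size check always tracks the current file only.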
I'm pulling commit data from the Gerrit API, and the number of commits is in the 226,000 range. Since I have to make a request to an endpoint for each and every commit, this understandably takes a long time. I was wondering how I could best implement threading into my current process.
I have two classes: a Project class, which drills down and retrieves all commits associated with it and saves them as Commit objects that contain all the information necessary to then loop through and get the JSON associated with each. I am pulling them all into a big list and then iterating through it to call the get_data and write_data methods.
class Project(object):
    def __init__(self, name):
        self.name = name
        self.commits = []

    def add_commits(self, changes_list):
        for change in changes_list:
            change_id = change['change_id'],
            revision_list = change['revisions']
            self.commits.extend([Commit(rid, change_id)
                                 for rid in revision_list.keys()])

    def return_results(self, ger_obj, start=0):
        self.ger = ger_obj
        while True:
            endpoint = (r'/changes/?q=project:{project}&o=ALL_REVISIONS&'
                        r'S={num}'.format(
                            project=self.name,
                            num=start
                        ))
            logging.info('Endpoint: {}'.format(endpoint))
            try:
                changes = ger_obj.get(endpoint)
                self.add_commits(changes_list=changes)
            except HTTPError:
                break
            start += 500
            try:
                if not changes[-1].get('_more_changes'):
                    break
            except IndexError:
                break


class Commit(object):
    def __init__(self, rev_id, change_id):
        self.rev_id = rev_id
        self.change_id = change_id

    def get_data(self, ger_obj):
        endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
            c_id=self.change_id[0],
            r_id=self.rev_id
        ))
        try:
            self.data = ger_obj.get(endpoint)
        except HTTPError as e:
            logging.warning('Endpoint: {} did not return data'.format(
                endpoint
            ))
        else:
            self.data['commitid'] = self.data.get('commit')
            self.data['name'] = self.data.get('committer')['name']
            self.data['email'] = self.data.get('committer')['email']
            self.data['date'] = self.data.get('committer')['date']

            hash = md5()
            hash.update(json.dumps(self.data).encode('utf-8'))
            self.data['etl_checksum_md5'] = hash.hexdigest()
            self.data['etl_process_status'] = ETL_PROCESS_STATUS
            self.data['etl_datetime_local'] = ETL_DATETIME_LOCAL
            self.data['etl_pdi_version'] = ETL_PDI_VERSION
            self.data['etl_pdi_build_version'] = ETL_PDI_BUILD_VERSION
            self.data['etl_pdi_hostname'] = ETL_PDI_HOSTNAME
            self.data['etl_pdi_ipaddress'] = ETL_PDI_IPADDRESS
            self.data['message'] = self.data['message'].replace('\n', ' ').replace('|', '[pipe]')

    def write_data(self, writer):
        writer.writerow(self.data)
I'm thinking that the best place to implement the threads is once I have all the commits in a list and am ready to iterate over them:
projects = [Project(value['id']) for value in project_data.values()]

for project in projects[:10]:
    if project.name in bad_names.keys():
        project.name = bad_names[project.name]
    project.return_results(rest)
    all_commits.extend(project.commits)

fieldnames = get_fieldnames(
    'ods_gerrit.staging_gerrit_commits',
    REDSHIFT_POSTGRES_INFO)

with open('testfile.csv', 'wb') as outf:
    writer = DictWriter(
        outf,
        fieldnames=fieldnames,
        extrasaction='ignore',
        delimiter='|'
    )
    # Implement Threading?
    for commit in all_commits:
        commit.get_data(rest)
        try:
            commit.write_data(writer=writer)
        except AttributeError:
            continue
        except Exception:
            print commit.data, 'caused an exception.'
            continue
I've read a few threading tutorials, and am unsure as to how to properly do this. I'm particularly worried about overwriting data due to improper locking.
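For reference, one common pattern (just a sketch, reusing all_commits, rest, fieldnames, and DictWriter from the code above, and assuming the rest client's get calls are thread-safe) is to fetch the commit data in a thread pool while keeping all CSV writing in the main thread, so the writer itself never needs a lock. concurrent.futures is in the standard library on Python 3, and a backport exists for Python 2:

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(commit):
    # Network-bound work only; each worker mutates just its own commit object.
    commit.get_data(rest)
    return commit

with open('testfile.csv', 'wb') as outf:
    writer = DictWriter(outf, fieldnames=fieldnames,
                        extrasaction='ignore', delimiter='|')
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [pool.submit(fetch, commit) for commit in all_commits]
        for future in as_completed(futures):
            commit = future.result()
            try:
                # Only the main thread touches the writer, so rows cannot interleave.
                commit.write_data(writer=writer)
            except AttributeError:
                continue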
I have written an async file upload RequestHandler. It is correct byte-wise; that is, the files I receive are identical to the ones being sent. One issue that I am having trouble figuring out is the upload delay. Specifically, when I issue the POST request to upload the file while testing locally, I see the browser's upload progress get stuck. For files close to 4 MB in size it gets stuck at 50%+ for a little while, then some time passes and it sends all of the data, and then gets stuck on "waiting for localhost...". The whole process may take 3+ minutes.
The kicker is that when I add print statements ending with a newline to the data_received method, the delays disappear. Does the print statement somehow trigger the network buffers to be flushed?
Here is the implementation of data_received, along with the helper methods:
@tornado.gen.coroutine
def _read_data(self, cont_buf):
    '''
    Read the file data.
    @param cont_buf - buffered HTTP request
    @returns boolean indicating whether data is still being read, and the new
             buffer
    '''
    # Check only last characters of the buffer guaranteed to be large
    # enough to contain the boundary
    end_of_data_idx = cont_buf.find(self._boundary)
    if end_of_data_idx >= 0:
        data = cont_buf[:(end_of_data_idx - self.LSEP)]
        self.receive_data(self.header_list[-1], data)
        new_buffer = cont_buf[(end_of_data_idx + len(self._boundary)):]
        return False, new_buffer
    else:
        self.receive_data(self.header_list[-1], cont_buf)
        return True, b""

@tornado.gen.coroutine
def _parse_params(self, param_buf):
    '''
    Parse HTTP header parameters.
    @param param_buf - string buffer containing the parameters.
    @returns parameters dictionary
    '''
    params = dict()
    param_res = self.PAT_HEADERPARAMS.findall(param_buf)
    if param_res:
        for name, value in param_res:
            params[name] = value
    elif param_buf:
        params['value'] = param_buf
    return params

@tornado.gen.coroutine
def _parse_header(self, header_buf):
    '''
    Parses a buffer containing an individual header with parameters.
    @param header_buf - header buffer containing a single header
    @returns header dictionary
    '''
    res = self.PAT_HEADERVALUE.match(header_buf)
    header = dict()
    if res:
        name, value, tail = res.groups()
        header = {'name': name, 'value': value,
                  'params': (yield self._parse_params(tail))}
    elif header_buf:
        header = {"value": header_buf}
    return header

@tornado.gen.coroutine
def data_received(self, chunk):
    '''
    Processes a chunk of content body.
    @param chunk - a piece of content body.
    '''
    self._count += len(chunk)
    self._buffer += chunk

    # Has boundary been established?
    if not self._boundary:
        self._boundary, self._buffer =\
            (yield self._extract_boundary(self._buffer))

        if (not self._boundary
                and len(self._buffer) > self.BOUNDARY_SEARCH_BUF_SIZE):
            raise RuntimeError("Cannot find multipart delimiter.")

    while True:
        if self._receiving_data:
            self._receiving_data, self._buffer = yield self._read_data(self._buffer)
            if self._is_end_of_request(self._buffer):
                yield self.request_done()
                break
            elif self._is_end_of_data(self._buffer):
                break
        else:
            headers, self._buffer = yield self._read_headers(self._buffer)
            if headers:
                self.header_list.append(headers)
                self._receiving_data = True
            else:
                break