writing to files: switch to new file after X MB file capacity - python

I have millions of domains which I will send WHOIS query and record WHOIS response on some .txt file.
I would like to set maximum capacity for a single .txt output file. For example, let's say I started recording responses on out0.txt. I want to switch to out1.txt if out0.txt is >= 100mb. Same thing goes for out1.txt, if out1.txt>=100mb then start writing to out2.txtand so on.
I know that I can do if checks after each insertion, but I want my code to be fast: i.e. I thought if checks at each domain can slow down my code. (It will asynchronously query millions of domains).
I imagined a try-except block could solve my issue here, like this:
folder_name = "out%s.txt"
folder_number = 0
folder_name = folder_name % folder_number
f = open(folder_name, 'w+')
for domain in millions_of_domains:
try:
response_json = send_whois_query(domain)
f.write(response_json)
except FileGreaterThan100MbException:
folder_number += 1
folder_name = folder_name % folder_number
f = open(folder_name, 'w+')
f.write(response_json)
Any suggestions will be appreciated. Thank you for your time.

You can create a wrapper object that tracks how much data has been written, and opens a new file if you reached a limit:
class MaxSizeFileWriter(object):
def __init__(self, filenamepattern, maxdata=2**20, # default 1Mb
start=0, mode='w', *args, **kwargs):
self._pattern = filenamepattern
self._counter = start
self._mode = mode
self._args, self._kwargs = args, kwargs
self._max = maxdata
self._openfile = None
self._written = 0
def _open(self):
if self._openfile is not None:
filename = self._pattern.format(self._counter)
self._counter += 1
self._openfile = open(filename, mode=self._mode, *self._args, **self._kwargs)
def _close(self):
if self._openfile is not None:
self._openfile.close()
def __enter__(self):
return self
def __exit__(self, *args, **kwargs):
if self._openfile is not None:
self._openfile.close()
def write(self, data):
if self._written + len(data) > self._max:
# current file too full to fit data too, close it
# This will trigger a new file to be opened.
self._close()
self._open() # noop if already open
self._openfile.write(data)
self._written += len(data)
The above is a context manager, and can be used just like a regular file. Pass in a filename with a {} placeholder for the number to be inserted into:
folder_name = "out{}.txt"
with MaxSizeFileWriter(folder_name, maxdata=100 * 2**10) as f:
for domain in millions_of_domains:
response_json = send_whois_query(domain)
f.write(response_json)

Related

Python writing to file keeps memory allocated

I have a large program (simulation of a business process) where I use a separate process to write finished parts to a file. The problem is that when writing the data to the file, the program still keeps the data in memory even if I explicitly tell it to delete it. If I run the same program with only the line for writing into file commented, it won't keep any of the data in memory.
Do I do something wrong with freeing the memory?
This is the part of the code responsible for writing it to file:
class CSVCustomWriter:
def __init__(self, simulation_id, csv_delimiter, attribute_names):
self.csv_delimiter = csv_delimiter
self.simulation_id = simulation_id
self.attribute_names = attribute_names
self.header = ['CaseId', 'Activity', 'Start', 'End', 'Resource',
'CostPerEvent', 'CostPerResource'] + attribute_names
self.base_path = os.path.join('results', str(self.simulation_id))
if not os.path.exists(self.base_path):
os.makedirs(self.base_path)
self.path_csv = os.path.join(self.base_path, 'data.csv')
self.file = open(self.path_csv, 'wt', encoding='utf-8')
self.file.write(self.csv_delimiter.join(self.header) + '\n')
self.closed = False
def add(self, data):
buffer = ''
case_id, previous_activities, attributes = data
current_attributes = tuple(attributes[a] for a in self.attribute_names)
for a in previous_activities:
buffer += self.csv_delimiter.join(
(str(case_id), str(a[0]), str(a[1]), str(a[2]), str(a[6]), str(a[4]), str(a[5]))
+ current_attributes)
buffer += '\n'
self.file.write(buffer)
del current_attributes
del case_id
del previous_activities
del attributes
del buffer
def close(self):
if not self.closed:
self.file.close()
self.closed = True
After each case ends, its sent using the add method to be written to the file. If I remove self.file.write(buffer) line, the usage of ram for this part of the program drops to nearly zero.

Luigi task not writing pandas df to csv

I have the following code to simply an excel file and return only the required columns. It written as a luigi task containerized on docker and its not returning the csv file while _SUCCESS flag is being created.
Function Code:
def _save_datasets(simplified, outdir: Path, flag):
out_clean = outdir / 'transformed.csv/'
flag = outdir / flag
simplified.to_csv(str(out_clean), index=False)
# save as csv and create flag file
flag.touch()
#click.command()
#click.option('--in-csv')
#click.option('--out-dir')
#click.option('--flag')
def transform_data(in_csv,out_dir, flag):
out_dir = Path(out_dir)
data=pd.read_csv(in_csv)
req_dp = data[['description','points']]
#simplifying the points according to range
def transform_points_simplified(points):
if points < 84:
return 1
elif points >= 84 and points < 88:
return 2
elif points >= 88 and points < 92:
return 3
elif points >= 92 and points < 96:
return 4
else:
return 5
simplified = req_dp.assign(points_simplified = dp['points'].apply(transform_points_simplified))
_save_datasets(simplified,out_dir, flag)
Luigi Task code:
#Transform
class TransformData(DockerTask):
"""Task to simplify datasets"""
in_path = '/usr/share/data/created_csv/'
in_csv = luigi.Parameter(default= in_path + 'cleaned.csv')
out_dir = luigi.Parameter(default='/usr/share/data/created_csv/')
flag = luigi.Parameter('.SUCCESS_TransformData')
#property
def image(self):
return f'code-chal/transform-data:{VERSION}'
def requires(self):
return CleanData()
#property
def command(self):
return [
'python', 'clean_data.py',
'--in-csv', self.in_csv,
'--out-dir', self.out_dir,
'--flag', self.flag
]
def output(self):
return luigi.LocalTarget(
path=str(Path(self.out_dir) / self.flag)
)
The luigi task moves on to the next task due to the creation of _SUCCESS flag, but the next task fails since its dependent on the transformed.csv file which isn't being created.
Thanks
In your LuigiTask you need a run function that needs to save the file you want using the output target from the output function.
So you need to add:
def run(self):
outfile = open(self.output().path, 'wb') # Notice that it references to the path of the self.output function
transform_data(self.in_csv, outfile, self.flag)

Iteratively process large wikipedia dump

I want to parse a large wikipedia dump iteratively. I found a tutorial for this here: https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c
However, when I want to read in the data like this:
data_path = 'C:\\Users\\Me\\datasets\\dewiki-latest-pages-articles1.xml-p1p262468.bz2'
import xml.sax
class WikiXmlHandler(xml.sax.handler.ContentHandler):
"""Content handler for Wiki XML data using SAX"""
def __init__(self):
xml.sax.handler.ContentHandler.__init__(self)
self._buffer = None
self._values = {}
self._current_tag = None
self._pages = []
def characters(self, content):
"""Characters between opening and closing tags"""
if self._current_tag:
self._buffer.append(content)
def startElement(self, name, attrs):
"""Opening tag of element"""
if name in ('title', 'text'):
self._current_tag = name
self._buffer = []
def endElement(self, name):
"""Closing tag of element"""
if name == self._current_tag:
self._values[name] = ' '.join(self._buffer)
if name == 'page':
self._pages.append((self._values['title'], self._values['text']))
# Object for handling xml
handler = WikiXmlHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
# Iteratively process file
for line in subprocess.Popen(['bzcat'],
stdin = open(data_path),
stdout = subprocess.PIPE,shell=True).stdout:
parser.feed(line)
# Stop when 3 articles have been found
if len(handler._pages) > 3:
break
it seems like nothing happens. The handler._pages list is empty. This is where the parsed articles should be stored. I also added shell=True because otherwise I get the error message FileNotFoundError: [WinError 2].
I never worked with subprocesses in python so I don't know what the problem might be.
I also tried to specify the data_path differently (with / and //).
Thank you in advance.

How to properly implement threading while writing out to a csv?

I'm pulling commit data from the Gerrit API, and the commit number is in the 226,000 range. Where I have to make a request to an endpoint for each and every commit, this is understandable taking a long time. I was wondering how I could best implement threading into my current process.
I have two classes, a Project class, which drills down and retrieves all commits associated with it, and saves them out as a Commit object that contains all the information necessary to then loop through and get the json associated with it. I am pulling them all into a big list, and then iterating through to call the get_data and write_data methods.
class Project(object):
def __init__(self, name):
self.name = name
self.commits = []
def add_commits(self, changes_list):
for change in changes_list:
change_id=change['change_id'],
revision_list=change['revisions']
self.commits.extend([Commit(rid, change_id)
for rid in revision_list.keys()])
def return_results(self, ger_obj, start=0):
self.ger = ger_obj
while True:
endpoint = (r'/changes/?q=project:{project}&o=ALL_REVISIONS&'
r'S={num}'.format(
project=self.name,
num=start
))
logging.info('Endpoint: {}'.format(endpoint))
try:
changes = ger_obj.get(endpoint)
self.add_commits(changes_list=changes)
except HTTPError:
break
start += 500
try:
if not changes[-1].get('_more_changes'):
break
except IndexError:
break
class Commit(object):
def __init__(self, rev_id, change_id):
self.rev_id = rev_id
self.change_id = change_id
def get_data(self, ger_obj):
endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
c_id=self.change_id[0],
r_id=self.rev_id
))
try:
self.data = ger_obj.get(endpoint)
except HTTPError as e:
logging.warning('Endpoint: {} did not return data'.format(
endpoint
))
else:
self.data['commitid'] = self.data.get('commit')
self.data['name'] = self.data.get('committer')['name']
self.data['email'] = self.data.get('committer')['email']
self.data['date'] = self.data.get('committer')['date']
hash = md5()
hash.update(json.dumps(self.data).encode('utf-8'))
self.data['etl_checksum_md5'] = hash.hexdigest()
self.data['etl_process_status'] = ETL_PROCESS_STATUS
self.data['etl_datetime_local'] = ETL_DATETIME_LOCAL
self.data['etl_pdi_version'] = ETL_PDI_VERSION
self.data['etl_pdi_build_version'] = ETL_PDI_BUILD_VERSION
self.data['etl_pdi_hostname'] = ETL_PDI_HOSTNAME
self.data['etl_pdi_ipaddress'] = ETL_PDI_IPADDRESS
self.data['message'] = self.data['message'].replace('\n', ' ').replace('|', '[pipe]')
def write_data(self, writer):
writer.writerow(self.data)
I'm thinking that the best place to implement the threads is once I have all the commits in a list and am ready to iterate over them:
projects = [Project(value['id']) for value in project_data.values()]
for project in projects[:10]:
if project.name in bad_names.keys():
project.name = bad_names[project.name]
project.return_results(rest)
all_commits.extend(project.commits)
fieldnames = get_fieldnames(
'ods_gerrit.staging_gerrit_commits',
REDSHIFT_POSTGRES_INFO)
with open('testfile.csv', 'wb') as outf:
writer = DictWriter(
outf,
fieldnames=fieldnames,
extrasaction='ignore',
delimiter='|'
)
# Implement Threading?
for commit in all_commits:
commit.get_data(rest)
try:
commit.write_data(writer=writer)
except AttributeError:
continue
except Exception:
print commit.data, 'caused an exception.'
continue
I've read a few threading tutorials, and am unsure as to how to properly do this. I'm particularly worried about overwriting data due to improper locking.

Converting an UNIX python program to work in windows [closed]

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 9 years ago.
I need to make a program that drives a DYMO LabelManager PnP label printing device. DYMO provides a SDK for this purpose, but after some desperate trying, I'd say the SDK is useless. Then I found a program which is just what I need, written by a guy named S.Bronner. But the problem is that his program is made for Python in UNIX, and I would need it to work in Windows with python. So I'm asking, is there anyone who could examine this code and convert it to work in windows for me? My Python skills are not good enough to accomplish this. Here is the code which should be converted:
#!/usr/bin/env python
DEV_CLASS = 3
DEV_VENDOR = 0x0922
DEV_PRODUCT = 0x1001
DEV_NODE = None
DEV_NAME = 'Dymo LabelManager PnP'
FONT_FILENAME = '/usr/share/fonts/truetype/ttf-bitstream-vera/Vera.ttf'
FONT_SIZERATIO = 7./8
import Image
import ImageDraw
import ImageFont
import array
import fcntl
import os
import re
import struct
import subprocess
import sys
import termios
import textwrap
class DymoLabeler:
"""
Create and work with a Dymo LabelManager PnP object.
This class contains both mid-level and high-level functions. In general,
the high-level functions should be used. However, special purpose usage
may require the mid-level functions. That is why they are provided.
However, they should be well understood before use. Look at the
high-level functions for help. Each function is marked in its docstring
with 'HLF' or 'MLF' in parentheses.
"""
def __init__(self, dev):
"""Initialize the LabelManager object. (HLF)"""
self.maxBytesPerLine = 8 # 64 pixels on a 12mm-tape
self.ESC = 0x1b
self.SYN = 0x16
self.cmd = []
self.rsp = False
self.bpl = None
self.dtb = 0
if not os.access(dev, os.R_OK | os.W_OK): return False
self.dev = open(dev, 'r+')
def sendCommand(self):
"""Send the already built command to the LabelManager. (MLF)"""
if len(self.cmd) == 0: return
cmdBin = array.array('B', self.cmd)
cmdBin.tofile(self.dev)
self.cmd = []
if not self.rsp: return
self.rsp = False
rspBin = self.dev.read(8)
rsp = array.array('B', rspBin).tolist()
return rsp
def resetCommand(self):
"""Remove a partially built command. (MLF)"""
self.cmd = []
self.rsp = False
def buildCommand(self, cmd):
"""Add the next instruction to the command. (MLF)"""
self.cmd += cmd
def statusRequest(self):
"""Set instruction to get the device's status. (MLF)"""
cmd = [self.ESC, ord('A')]
self.buildCommand(cmd)
self.rsp = True
def dotTab(self, value):
"""Set the bias text height, in bytes. (MLF)"""
if value < 0 or value > self.maxBytesPerLine: raise ValueError
cmd = [self.ESC, ord('B'), value]
self.buildCommand(cmd)
self.dtb = value
self.bpl = None
def tapeColor(self, value):
"""Set the tape color. (MLF)"""
if value < 0: raise ValueError
cmd = [self.ESC, ord('C'), value]
self.buildCommand(cmd)
def bytesPerLine(self, value):
"""Set the number of bytes sent in the following lines. (MLF)"""
if value < 0 or value + self.dtb > self.maxBytesPerLine: raise ValueError
if value == self.bpl: return
cmd = [self.ESC, ord('D'), value]
self.buildCommand(cmd)
self.bpl = value
def cut(self):
"""Set instruction to trigger cutting of the tape. (MLF)"""
cmd = [self.ESC, ord('E')]
self.buildCommand(cmd)
def line(self, value):
"""Set next printed line. (MLF)"""
self.bytesPerLine(len(value))
cmd = [self.SYN] + value
self.buildCommand(cmd)
def chainMark(self):
"""Set Chain Mark. (MLF)"""
self.dotTab(0)
self.bytesPerLine(self.maxBytesPerLine)
self.line([0x99] * self.maxBytesPerLine)
def skipLines(self, value):
"""Set number of lines of white to print. (MLF)"""
if value <= 0: raise ValueError
self.bytesPerLine(0)
cmd = [self.SYN] * value
self.buildCommand(cmd)
def initLabel(self):
"""Set the label initialization sequence. (MLF)"""
cmd = [0x00] * 8
self.buildCommand(cmd)
def getStatus(self):
"""Ask for and return the device's status. (HLF)"""
self.statusRequest()
rsp = self.sendCommand()
print rsp
def printLabel(self, lines, dotTab):
"""Print the label described by lines. (HLF)"""
self.initLabel
self.tapeColor(0)
self.dotTab(dotTab)
for line in lines:
self.line(line)
self.skipLines(56) # advance printed matter past cutter
self.skipLines(56) # add symmetric margin
self.statusRequest()
rsp = self.sendCommand()
print rsp
def die(message=None):
if message: print >> sys.stderr, message
sys.exit(1)
def pprint(par, fd=sys.stdout):
rows, columns = struct.unpack('HH', fcntl.ioctl(sys.stderr, termios.TIOCGWINSZ, struct.pack('HH', 0, 0)))
print >> fd, textwrap.fill(par, columns)
def getDeviceFile(classID, vendorID, productID):
# find file containing the device's major and minor numbers
searchdir = '/sys/bus/hid/devices'
pattern = '^%04d:%04X:%04X.[0-9A-F]{4}$' % (classID, vendorID, productID)
deviceCandidates = os.listdir(searchdir)
foundpath = None
for devname in deviceCandidates:
if re.match(pattern, devname):
foundpath = os.path.join(searchdir, devname)
break
if not foundpath: return
searchdir = os.path.join(foundpath, 'hidraw')
devname = os.listdir(searchdir)[0]
foundpath = os.path.join(searchdir, devname)
filepath = os.path.join(foundpath, 'dev')
# get the major and minor numbers
f = open(filepath, 'r')
devnums = [int(n) for n in f.readline().strip().split(':')]
f.close()
devnum = os.makedev(devnums[0], devnums[1])
# check if a symlink with the major and minor numbers is available
filepath = '/dev/char/%d:%d' % (devnums[0], devnums[1])
if os.path.exists(filepath):
return os.path.realpath(filepath)
# check if the relevant sysfs path component matches a file name in
# /dev, that has the proper major and minor numbers
filepath = os.path.join('/dev', devname)
if os.stat(filepath).st_rdev == devnum:
return filepath
# search for a device file with the proper major and minor numbers
for dirpath, dirnames, filenames in os.walk('/dev'):
for filename in filenames:
filepath = os.path.join(dirpath, filename)
if os.stat(filepath).st_rdev == devnum:
return filepath
def access_error(dev):
pprint('You do not have sufficient access to the device file %s:' % dev, sys.stderr)
subprocess.call(['ls', '-l', dev], stdout=sys.stderr)
print >> sys.stderr
pprint('You probably want to add a rule in /etc/udev/rules.d along the following lines:', sys.stderr)
print >> sys.stderr, ' SUBSYSTEM=="hidraw", \\'
print >> sys.stderr, ' ACTION=="add", \\'
print >> sys.stderr, ' DEVPATH=="/devices/pci[0-9]*/usb[0-9]*/0003:0922:1001.*/hidraw/hidraw0", \\'
print >> sys.stderr, ' GROUP="plugdev"'
print >> sys.stderr
pprint('Following that, turn off your device and back on again to activate the new permissions.', sys.stderr)
# get device file name
if not DEV_NODE:
dev = getDeviceFile(DEV_CLASS, DEV_VENDOR, DEV_PRODUCT)
else:
dev = DEV_NODE
if not dev: die("The device '%s' could not be found on this system." % DEV_NAME)
# create dymo labeler object
lm = DymoLabeler(dev)
if not lm: die(access_error(dev))
# check for any text specified on the command line
labeltext = [arg.decode(sys.stdin.encoding) for arg in sys.argv[1:]]
if len(labeltext) == 0: die("No label text was specified.")
# create an empty label image
labelheight = lm.maxBytesPerLine * 8
lineheight = float(labelheight) / len(labeltext)
fontsize = int(round(lineheight * FONT_SIZERATIO))
font = ImageFont.truetype(FONT_FILENAME, fontsize)
labelwidth = max(font.getsize(line)[0] for line in labeltext)
labelbitmap = Image.new('1', (labelwidth, labelheight))
# write the text into the empty image
labeldraw = ImageDraw.Draw(labelbitmap)
for i, line in enumerate(labeltext):
lineposition = int(round(i * lineheight))
labeldraw.text((0, lineposition), line, font=font, fill=255)
del labeldraw
# convert the image to the proper matrix for the dymo labeler object
labelrotated = labelbitmap.transpose(Image.ROTATE_270)
labelstream = labelrotated.tostring()
labelstreamrowlength = labelheight/8 + (1 if labelheight%8 != 0 else 0)
if len(labelstream)/labelstreamrowlength != labelwidth: die('An internal problem was encountered while processing the label bitmap!')
labelrows = [labelstream[i:i+labelstreamrowlength] for i in range(0, len(labelstream), labelstreamrowlength)]
labelmatrix = [array.array('B', labelrow).tolist() for labelrow in labelrows]
# optimize the matrix for the dymo label printer
dottab = 0
while max(line[0] for line in labelmatrix) == 0:
labelmatrix = [line[1:] for line in labelmatrix]
dottab += 1
for line in labelmatrix:
while len(line) > 0 and line[-1] == 0:
del line[-1]
# print the label
lm.printLabel(labelmatrix, dottab)
FONT_FILENAME = '/usr/share/fonts/truetype/ttf-bitstream-vera/Vera.ttf'
// should be changed to path to the font on your system
won't work because of filesystem differences.
searchdir = '/sys/bus/hid/devices'
// take a look at "pywinusb" library (?)
won't work either, you have to get the devices in a different way. Not sure from where though. The same problem is
filepath = '/dev/char/%d:%d' % (devnums[0], devnums[1])
this isn't accessible in Windows and you have to do in a different way.
Besides that everything else looks OS independent. If you have any errors after fixing previous 3 problems, then edit them into your question please.

Categories

Resources