Python script taking too much memory [duplicate] - python

I've recently become interested in algorithms and have begun exploring them by writing a naive implementation and then optimizing it in various ways.
I'm already familiar with the standard Python module for profiling runtime (for most things I've found the timeit magic function in IPython to be sufficient), but I'm also interested in memory usage so I can explore those tradeoffs as well (e.g. the cost of caching a table of previously computed values versus recomputing them as needed). Is there a module that will profile the memory usage of a given function for me?

Python 3.4 includes a new module: tracemalloc. It provides detailed statistics about which code is allocating the most memory. Here's an example that displays the top three lines allocating memory.
from collections import Counter
import linecache
import os
import tracemalloc
def display_top(snapshot, key_type='lineno', limit=3):
snapshot = snapshot.filter_traces((
tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
tracemalloc.Filter(False, "<unknown>"),
))
top_stats = snapshot.statistics(key_type)
print("Top %s lines" % limit)
for index, stat in enumerate(top_stats[:limit], 1):
frame = stat.traceback[0]
# replace "/path/to/module/file.py" with "module/file.py"
filename = os.sep.join(frame.filename.split(os.sep)[-2:])
print("#%s: %s:%s: %.1f KiB"
% (index, filename, frame.lineno, stat.size / 1024))
line = linecache.getline(frame.filename, frame.lineno).strip()
if line:
print(' %s' % line)
other = top_stats[limit:]
if other:
size = sum(stat.size for stat in other)
print("%s other: %.1f KiB" % (len(other), size / 1024))
total = sum(stat.size for stat in top_stats)
print("Total allocated size: %.1f KiB" % (total / 1024))
tracemalloc.start()
counts = Counter()
fname = '/usr/share/dict/american-english'
with open(fname) as words:
words = list(words)
for word in words:
prefix = word[:3]
counts[prefix] += 1
print('Top prefixes:', counts.most_common(3))
snapshot = tracemalloc.take_snapshot()
display_top(snapshot)
And here are the results:
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)]
Top 3 lines
#1: scratches/memory_test.py:37: 6527.1 KiB
words = list(words)
#2: scratches/memory_test.py:39: 247.7 KiB
prefix = word[:3]
#3: scratches/memory_test.py:40: 193.0 KiB
counts[prefix] += 1
4 other: 4.3 KiB
Total allocated size: 6972.1 KiB
When is a memory leak not a leak?
That example is great when the memory is still being held at the end of the calculation, but sometimes you have code that allocates a lot of memory and then releases it all. It's not technically a memory leak, but it's using more memory than you think it should. How can you track memory usage when it all gets released? If it's your code, you can probably add some debugging code to take snapshots while it's running. If not, you can start a background thread to monitor memory usage while the main thread runs.
Here's the previous example where the code has all been moved into the count_prefixes() function. When that function returns, all the memory is released. I also added some sleep() calls to simulate a long-running calculation.
from collections import Counter
import linecache
import os
import tracemalloc
from time import sleep
def count_prefixes():
sleep(2) # Start up time.
counts = Counter()
fname = '/usr/share/dict/american-english'
with open(fname) as words:
words = list(words)
for word in words:
prefix = word[:3]
counts[prefix] += 1
sleep(0.0001)
most_common = counts.most_common(3)
sleep(3) # Shut down time.
return most_common
def main():
tracemalloc.start()
most_common = count_prefixes()
print('Top prefixes:', most_common)
snapshot = tracemalloc.take_snapshot()
display_top(snapshot)
def display_top(snapshot, key_type='lineno', limit=3):
snapshot = snapshot.filter_traces((
tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
tracemalloc.Filter(False, "<unknown>"),
))
top_stats = snapshot.statistics(key_type)
print("Top %s lines" % limit)
for index, stat in enumerate(top_stats[:limit], 1):
frame = stat.traceback[0]
# replace "/path/to/module/file.py" with "module/file.py"
filename = os.sep.join(frame.filename.split(os.sep)[-2:])
print("#%s: %s:%s: %.1f KiB"
% (index, filename, frame.lineno, stat.size / 1024))
line = linecache.getline(frame.filename, frame.lineno).strip()
if line:
print(' %s' % line)
other = top_stats[limit:]
if other:
size = sum(stat.size for stat in other)
print("%s other: %.1f KiB" % (len(other), size / 1024))
total = sum(stat.size for stat in top_stats)
print("Total allocated size: %.1f KiB" % (total / 1024))
main()
When I run that version, the memory usage has gone from 6MB down to 4KB, because the function released all its memory when it finished.
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)]
Top 3 lines
#1: collections/__init__.py:537: 0.7 KiB
self.update(*args, **kwds)
#2: collections/__init__.py:555: 0.6 KiB
return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
#3: python3.6/heapq.py:569: 0.5 KiB
result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
10 other: 2.2 KiB
Total allocated size: 4.0 KiB
Now here's a version inspired by another answer that starts a second thread to monitor memory usage.
from collections import Counter
import linecache
import os
import tracemalloc
from datetime import datetime
from queue import Queue, Empty
from resource import getrusage, RUSAGE_SELF
from threading import Thread
from time import sleep
def memory_monitor(command_queue: Queue, poll_interval=1):
tracemalloc.start()
old_max = 0
snapshot = None
while True:
try:
command_queue.get(timeout=poll_interval)
if snapshot is not None:
print(datetime.now())
display_top(snapshot)
return
except Empty:
max_rss = getrusage(RUSAGE_SELF).ru_maxrss
if max_rss > old_max:
old_max = max_rss
snapshot = tracemalloc.take_snapshot()
print(datetime.now(), 'max RSS', max_rss)
def count_prefixes():
sleep(2) # Start up time.
counts = Counter()
fname = '/usr/share/dict/american-english'
with open(fname) as words:
words = list(words)
for word in words:
prefix = word[:3]
counts[prefix] += 1
sleep(0.0001)
most_common = counts.most_common(3)
sleep(3) # Shut down time.
return most_common
def main():
queue = Queue()
poll_interval = 0.1
monitor_thread = Thread(target=memory_monitor, args=(queue, poll_interval))
monitor_thread.start()
try:
most_common = count_prefixes()
print('Top prefixes:', most_common)
finally:
queue.put('stop')
monitor_thread.join()
def display_top(snapshot, key_type='lineno', limit=3):
snapshot = snapshot.filter_traces((
tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
tracemalloc.Filter(False, "<unknown>"),
))
top_stats = snapshot.statistics(key_type)
print("Top %s lines" % limit)
for index, stat in enumerate(top_stats[:limit], 1):
frame = stat.traceback[0]
# replace "/path/to/module/file.py" with "module/file.py"
filename = os.sep.join(frame.filename.split(os.sep)[-2:])
print("#%s: %s:%s: %.1f KiB"
% (index, filename, frame.lineno, stat.size / 1024))
line = linecache.getline(frame.filename, frame.lineno).strip()
if line:
print(' %s' % line)
other = top_stats[limit:]
if other:
size = sum(stat.size for stat in other)
print("%s other: %.1f KiB" % (len(other), size / 1024))
total = sum(stat.size for stat in top_stats)
print("Total allocated size: %.1f KiB" % (total / 1024))
main()
The resource module lets you check the current memory usage, and save the snapshot from the peak memory usage. The queue lets the main thread tell the memory monitor thread when to print its report and shut down. When it runs, it shows the memory being used by the list() call:
2018-05-29 10:34:34.441334 max RSS 10188
2018-05-29 10:34:36.475707 max RSS 23588
2018-05-29 10:34:36.616524 max RSS 38104
2018-05-29 10:34:36.772978 max RSS 45924
2018-05-29 10:34:36.929688 max RSS 46824
2018-05-29 10:34:37.087554 max RSS 46852
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)]
2018-05-29 10:34:56.281262
Top 3 lines
#1: scratches/scratch.py:36: 6527.0 KiB
words = list(words)
#2: scratches/scratch.py:38: 16.4 KiB
prefix = word[:3]
#3: scratches/scratch.py:39: 10.1 KiB
counts[prefix] += 1
19 other: 10.8 KiB
Total allocated size: 6564.3 KiB
If you're on Linux, you may find /proc/self/statm more useful than the resource module.

This one has been answered already here: Python memory profiler
Basically you do something like that (cited from Guppy-PE):
>>> from guppy import hpy; h=hpy()
>>> h.heap()
Partition of a set of 48477 objects. Total size = 3265516 bytes.
Index Count % Size % Cumulative % Kind (class / dict of class)
0 25773 53 1612820 49 1612820 49 str
1 11699 24 483960 15 2096780 64 tuple
2 174 0 241584 7 2338364 72 dict of module
3 3478 7 222592 7 2560956 78 types.CodeType
4 3296 7 184576 6 2745532 84 function
5 401 1 175112 5 2920644 89 dict of class
6 108 0 81888 3 3002532 92 dict (no owner)
7 114 0 79632 2 3082164 94 dict of type
8 117 0 51336 2 3133500 96 type
9 667 1 24012 1 3157512 97 __builtin__.wrapper_descriptor
<76 more rows. Type e.g. '_.more' to view.>
>>> h.iso(1,[],{})
Partition of a set of 3 objects. Total size = 176 bytes.
Index Count % Size % Cumulative % Kind (class / dict of class)
0 1 33 136 77 136 77 dict (no owner)
1 1 33 28 16 164 93 list
2 1 33 12 7 176 100 int
>>> x=[]
>>> h.iso(x).sp
0: h.Root.i0_modules['__main__'].__dict__['x']
>>>

If you only want to look at the memory usage of an object, (answer to other question)
There is a module called Pympler which contains the asizeof
module.
Use as follows:
from pympler import asizeof
asizeof.asizeof(my_object)
Unlike sys.getsizeof, it works for your self-created objects.
>>> asizeof.asizeof(tuple('bcd'))
200
>>> asizeof.asizeof({'foo': 'bar', 'baz': 'bar'})
400
>>> asizeof.asizeof({})
280
>>> asizeof.asizeof({'foo':'bar'})
360
>>> asizeof.asizeof('foo')
40
>>> asizeof.asizeof(Bar())
352
>>> asizeof.asizeof(Bar().__dict__)
280
>>> help(asizeof.asizeof)
Help on function asizeof in module pympler.asizeof:
asizeof(*objs, **opts)
Return the combined size in bytes of all objects passed as positional arguments.

Disclosure:
Applicable on Linux only
Reports memory used by the current process as a whole, not individual functions within
But nice because of its simplicity:
import resource
def using(point=""):
usage=resource.getrusage(resource.RUSAGE_SELF)
return '''%s: usertime=%s systime=%s mem=%s mb
'''%(point,usage[0],usage[1],
usage[2]/1024.0 )
Just insert using("Label") where you want to see what's going on. For example
print(using("before"))
wrk = ["wasting mem"] * 1000000
print(using("after"))
>>> before: usertime=2.117053 systime=1.703466 mem=53.97265625 mb
>>> after: usertime=2.12023 systime=1.70708 mem=60.8828125 mb

Below is a simple function decorator which allows to track how much memory the process consumed before the function call, after the function call, and what is the difference:
import time
import os
import psutil
def elapsed_since(start):
return time.strftime("%H:%M:%S", time.gmtime(time.time() - start))
def get_process_memory():
process = psutil.Process(os.getpid())
mem_info = process.memory_info()
return mem_info.rss
def profile(func):
def wrapper(*args, **kwargs):
mem_before = get_process_memory()
start = time.time()
result = func(*args, **kwargs)
elapsed_time = elapsed_since(start)
mem_after = get_process_memory()
print("{}: memory before: {:,}, after: {:,}, consumed: {:,}; exec time: {}".format(
func.__name__,
mem_before, mem_after, mem_after - mem_before,
elapsed_time))
return result
return wrapper
Here is my blog which describes all the details. (archived link)

Since the accepted answer and also the next highest voted answer have, in my opinion, some problems, I'd like to offer one more answer that is based closely on Ihor B.'s answer with some small but important modifications.
This solution allows you to run profiling on either by wrapping a function call with the profile function and calling it, or by decorating your function/method with the #profile decorator.
The first technique is useful when you want to profile some third-party code without messing with its source, whereas the second technique is a bit "cleaner" and works better when you are don't mind modifying the source of the function/method you want to profile.
I've also modified the output, so that you get RSS, VMS, and shared memory. I don't care much about the "before" and "after" values, but only the delta, so I removed those (if you're comparing to Ihor B.'s answer).
Profiling code
# profile.py
import time
import os
import psutil
import inspect
def elapsed_since(start):
#return time.strftime("%H:%M:%S", time.gmtime(time.time() - start))
elapsed = time.time() - start
if elapsed < 1:
return str(round(elapsed*1000,2)) + "ms"
if elapsed < 60:
return str(round(elapsed, 2)) + "s"
if elapsed < 3600:
return str(round(elapsed/60, 2)) + "min"
else:
return str(round(elapsed / 3600, 2)) + "hrs"
def get_process_memory():
process = psutil.Process(os.getpid())
mi = process.memory_info()
return mi.rss, mi.vms, mi.shared
def format_bytes(bytes):
if abs(bytes) < 1000:
return str(bytes)+"B"
elif abs(bytes) < 1e6:
return str(round(bytes/1e3,2)) + "kB"
elif abs(bytes) < 1e9:
return str(round(bytes / 1e6, 2)) + "MB"
else:
return str(round(bytes / 1e9, 2)) + "GB"
def profile(func, *args, **kwargs):
def wrapper(*args, **kwargs):
rss_before, vms_before, shared_before = get_process_memory()
start = time.time()
result = func(*args, **kwargs)
elapsed_time = elapsed_since(start)
rss_after, vms_after, shared_after = get_process_memory()
print("Profiling: {:>20} RSS: {:>8} | VMS: {:>8} | SHR {"
":>8} | time: {:>8}"
.format("<" + func.__name__ + ">",
format_bytes(rss_after - rss_before),
format_bytes(vms_after - vms_before),
format_bytes(shared_after - shared_before),
elapsed_time))
return result
if inspect.isfunction(func):
return wrapper
elif inspect.ismethod(func):
return wrapper(*args,**kwargs)
Example usage, assuming the above code is saved as profile.py:
from profile import profile
from time import sleep
from sklearn import datasets # Just an example of 3rd party function call
# Method 1
run_profiling = profile(datasets.load_digits)
data = run_profiling()
# Method 2
#profile
def my_function():
# do some stuff
a_list = []
for i in range(1,100000):
a_list.append(i)
return a_list
res = my_function()
This should result in output similar to the below:
Profiling: <load_digits> RSS: 5.07MB | VMS: 4.91MB | SHR 73.73kB | time: 89.99ms
Profiling: <my_function> RSS: 1.06MB | VMS: 1.35MB | SHR 0B | time: 8.43ms
A couple of important final notes:
Keep in mind, this method of profiling is only going to be approximate, since lots of other stuff might be happening on the machine. Due to garbage collection and other factors, the deltas might even be zero.
For some unknown reason, very short function calls (e.g. 1 or 2 ms)
show up with zero memory usage. I suspect this is some limitation of
the hardware/OS (tested on basic laptop with Linux) on how often
memory statistics are updated.
To keep the examples simple, I didn't use any function arguments, but they should work as one would expect, i.e.
profile(my_function, arg) to profile my_function(arg)

A simple example to calculate the memory usage of a block of codes / function using memory_profile, while returning result of the function:
import memory_profiler as mp
def fun(n):
tmp = []
for i in range(n):
tmp.extend(list(range(i*i)))
return "XXXXX"
calculate memory usage before running the code then calculate max usage during the code:
start_mem = mp.memory_usage(max_usage=True)
res = mp.memory_usage(proc=(fun, [100]), max_usage=True, retval=True)
print('start mem', start_mem)
print('max mem', res[0][0])
print('used mem', res[0][0]-start_mem)
print('fun output', res[1])
calculate usage in sampling points while running function:
res = mp.memory_usage((fun, [100]), interval=.001, retval=True)
print('min mem', min(res[0]))
print('max mem', max(res[0]))
print('used mem', max(res[0])-min(res[0]))
print('fun output', res[1])
Credits: #skeept

maybe it help:
<see additional>
pip install gprof2dot
sudo apt-get install graphviz
gprof2dot -f pstats profile_for_func1_001 | dot -Tpng -o profile.png
def profileit(name):
"""
#profileit("profile_for_func1_001")
"""
def inner(func):
def wrapper(*args, **kwargs):
prof = cProfile.Profile()
retval = prof.runcall(func, *args, **kwargs)
# Note use of name from outer scope
prof.dump_stats(name)
return retval
return wrapper
return inner
#profileit("profile_for_func1_001")
def func1(...)

Related

python myhdl package how to generate verilog initial block

From the code mostly from the sample of myhdl:
from myhdl import Signal, intbv, delay, always, now, Simulation, toVerilog
__debug = True
def ClkDriver(clk):
halfPeriod = delay(10)
#always(halfPeriod)
def driveClk():
clk.next = not clk
return driveClk
def HelloWorld(clk, outs):
counts = intbv(3)[32:]
#always(clk.posedge)
def sayHello():
outs.next = not outs
if counts >= 3 - 1:
counts.next = 0
else:
counts.next = counts + 1
if __debug__:
print "%s Hello World! outs %s %s" % (
now(), str(outs), str(outs.next))
return sayHello
clk = Signal(bool(0))
outs = Signal(intbv(0)[1:])
clkdriver_inst = ClkDriver(clk)
hello_inst = toVerilog(HelloWorld, clk, outs)
sim = Simulation(clkdriver_inst, hello_inst)
sim.run(150)
I expect it to generate a verilog program that contains an initial block, like something:
module HelloWorld(...)
reg [31:0] counts;
initial begin
counts = 32'h3
end
always #(...
How can you get the initial block generated?
Note that on the google cache for old.myhdl.org/doku.php/dev:initial_values it links to example https://bitbucket.org/cfelton/examples/src/tip/ramrom/ . So it looks the feature should be supported. However the rom sample generates static case statements. That's not what I'm looking for.
Three steps to resolve it:
Update to the latest myhdl on master or a version that contains the hash 87784ad which added the feature under issue #105 or #150. As an example for virtualenv, run a git clone, followed by pip install -e <path-to-myhdl-dir>.
Change the signal to a list.
Set toVerilog.initial_values=True before calling toVerilog.
Code snippet follows.
def HelloWorld(clk, outs):
counts = [Signal(intbv(3)[32:])]
#always(clk.posedge)
def sayHello():
outs.next = not outs
if counts[0] >= 3 - 1:
counts[0].next = 0
else:
counts[0].next = counts[0] + 1
if __debug__:
print "%s Hello World! outs %s %s %d" % (
now(), str(outs), str(outs.next), counts[0])
return sayHello
clk = Signal(bool(0))
outs = Signal(intbv(0)[1:])
clkdriver_inst = ClkDriver(clk)
toVerilog.initial_values=True
hello_inst = toVerilog(HelloWorld, clk, outs)
sim = Simulation(clkdriver_inst, hello_inst)
sim.run(150)

Size of file, human readable [duplicate]

A function to return human readable size from bytes size:
>>> human_readable(2048)
'2 kilobytes'
>>>
How to do this?
Addressing the above "too small a task to require a library" issue by a straightforward implementation (using f-strings, so Python 3.6+):
def sizeof_fmt(num, suffix="B"):
for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
if abs(num) < 1024.0:
return f"{num:3.1f}{unit}{suffix}"
num /= 1024.0
return f"{num:.1f}Yi{suffix}"
Supports:
all currently known binary prefixes
negative and positive numbers
numbers larger than 1000 Yobibytes
arbitrary units (maybe you like to count in Gibibits!)
Example:
>>> sizeof_fmt(168963795964)
'157.4GiB'
by Fred Cirera
A library that has all the functionality that it seems you're looking for is humanize. humanize.naturalsize() seems to do everything you're looking for.
Example code (python 3.10)
import humanize
disk_sizes_list = [1, 100, 999, 1000,1024, 2000,2048, 3000, 9999, 10000, 2048000000, 9990000000, 9000000000000000000000]
for size in disk_sizes_list:
natural_size = humanize.naturalsize(size)
binary_size = humanize.naturalsize(size, binary=True)
print(f" {natural_size} \t| {binary_size}\t|{size}")
Output
1 Byte | 1 Byte |1
100 Bytes | 100 Bytes |100
999 Bytes | 999 Bytes |999
1.0 kB | 1000 Bytes |1000
1.0 kB | 1.0 KiB |1024
2.0 kB | 2.0 KiB |2000
2.0 kB | 2.0 KiB |2048
3.0 kB | 2.9 KiB |3000
10.0 kB | 9.8 KiB |9999
10.0 kB | 9.8 KiB |10000
2.0 GB | 1.9 GiB |2048000000
10.0 GB | 9.3 GiB |9990000000
9.0 ZB | 7.6 ZiB |9000000000000000000000
The following works in Python 3.6+, is, in my opinion, the easiest to understand answer on here, and lets you customize the amount of decimal places used.
def human_readable_size(size, decimal_places=2):
for unit in ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']:
if size < 1024.0 or unit == 'PiB':
break
size /= 1024.0
return f"{size:.{decimal_places}f} {unit}"
There's always got to be one of those guys. Well today it's me. Here's a one-liner -- or two lines if you count the function signature.
def human_size(bytes, units=[' bytes','KB','MB','GB','TB', 'PB', 'EB']):
""" Returns a human readable string representation of bytes """
return str(bytes) + units[0] if bytes < 1024 else human_size(bytes>>10, units[1:])
 
>>> human_size(123)
123 bytes
>>> human_size(123456789)
117GB
If you need sizes bigger than an Exabyte, it's a little bit more gnarly:
def human_size(bytes, units=[' bytes','KB','MB','GB','TB', 'PB', 'EB']):
return str(bytes) + units[0] if bytes < 1024 else human_size(bytes>>10, units[1:]) if units[1:] else f'{bytes>>10}ZB'
Here's my version. It does not use a for-loop. It has constant complexity, O(1), and is in theory more efficient than the answers here that use a for-loop.
from math import log
unit_list = zip(['bytes', 'kB', 'MB', 'GB', 'TB', 'PB'], [0, 0, 1, 2, 2, 2])
def sizeof_fmt(num):
"""Human friendly file size"""
if num > 1:
exponent = min(int(log(num, 1024)), len(unit_list) - 1)
quotient = float(num) / 1024**exponent
unit, num_decimals = unit_list[exponent]
format_string = '{:.%sf} {}' % (num_decimals)
return format_string.format(quotient, unit)
if num == 0:
return '0 bytes'
if num == 1:
return '1 byte'
To make it more clear what is going on, we can omit the code for the string formatting. Here are the lines that actually do the work:
exponent = int(log(num, 1024))
quotient = num / 1024**exponent
unit_list[exponent]
I recently came up with a version that avoids loops, using log2 to determine the size order which doubles as a shift and an index into the suffix list:
from math import log2
_suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
def file_size(size):
# determine binary order in steps of size 10
# (coerce to int, // still returns a float)
order = int(log2(size) / 10) if size else 0
# format file size
# (.4g results in rounded numbers for exact matches and max 3 decimals,
# should never resort to exponent values)
return '{:.4g} {}'.format(size / (1 << (order * 10)), _suffixes[order])
Could well be considered unpythonic for its readability, though.
If you're using Django installed you can also try filesizeformat:
from django.template.defaultfilters import filesizeformat
filesizeformat(1073741824)
=>
"1.0 GB"
You should use "humanize".
>>> humanize.naturalsize(1000000)
'1.0 MB'
>>> humanize.naturalsize(1000000, binary=True)
'976.6 KiB'
>>> humanize.naturalsize(1000000, gnu=True)
'976.6K'
Reference:
https://pypi.org/project/humanize/
One such library is hurry.filesize.
>>> from hurry.filesize import alternative
>>> size(1, system=alternative)
'1 byte'
>>> size(10, system=alternative)
'10 bytes'
>>> size(1024, system=alternative)
'1 KB'
Using either powers of 1000 or kibibytes would be more standard-friendly:
def sizeof_fmt(num, use_kibibyte=True):
base, suffix = [(1000.,'B'),(1024.,'iB')][use_kibibyte]
for x in ['B'] + map(lambda x: x+suffix, list('kMGTP')):
if -base < num < base:
return "%3.1f %s" % (num, x)
num /= base
return "%3.1f %s" % (num, x)
P.S. Never trust a library that prints thousands with the K (uppercase) suffix :)
The HumanFriendly project helps with this.
import humanfriendly
humanfriendly.format_size(1024)
The above code will give 1KB as answer.
Examples can be found here.
Riffing on the snippet provided as an alternative to hurry.filesize(), here is a snippet that gives varying precision numbers based on the prefix used. It isn't as terse as some snippets, but I like the results.
def human_size(size_bytes):
"""
format a size in bytes into a 'human' file size, e.g. bytes, KB, MB, GB, TB, PB
Note that bytes/KB will be reported in whole numbers but MB and above will have greater precision
e.g. 1 byte, 43 bytes, 443 KB, 4.3 MB, 4.43 GB, etc
"""
if size_bytes == 1:
# because I really hate unnecessary plurals
return "1 byte"
suffixes_table = [('bytes',0),('KB',0),('MB',1),('GB',2),('TB',2), ('PB',2)]
num = float(size_bytes)
for suffix, precision in suffixes_table:
if num < 1024.0:
break
num /= 1024.0
if precision == 0:
formatted_size = "%d" % num
else:
formatted_size = str(round(num, ndigits=precision))
return "%s %s" % (formatted_size, suffix)
This will do what you need in almost any situation, is customizable with optional arguments, and as you can see, is pretty much self-documenting:
from math import log
def pretty_size(n,pow=0,b=1024,u='B',pre=['']+[p+'i'for p in'KMGTPEZY']):
pow,n=min(int(log(max(n*b**pow,1),b)),len(pre)-1),n*b**pow
return "%%.%if %%s%%s"%abs(pow%(-pow-1))%(n/b**float(pow),pre[pow],u)
Example output:
>>> pretty_size(42)
'42 B'
>>> pretty_size(2015)
'2.0 KiB'
>>> pretty_size(987654321)
'941.9 MiB'
>>> pretty_size(9876543210)
'9.2 GiB'
>>> pretty_size(0.5,pow=1)
'512 B'
>>> pretty_size(0)
'0 B'
Advanced customizations:
>>> pretty_size(987654321,b=1000,u='bytes',pre=['','kilo','mega','giga'])
'987.7 megabytes'
>>> pretty_size(9876543210,b=1000,u='bytes',pre=['','kilo','mega','giga'])
'9.9 gigabytes'
This code is both Python 2 and Python 3 compatible. PEP8 compliance is an exercise for the reader. Remember, it's the output that's pretty.
Update:
If you need thousands commas, just apply the obvious extension:
def prettier_size(n,pow=0,b=1024,u='B',pre=['']+[p+'i'for p in'KMGTPEZY']):
r,f=min(int(log(max(n*b**pow,1),b)),len(pre)-1),'{:,.%if} %s%s'
return (f%(abs(r%(-r-1)),pre[r],u)).format(n*b**pow/b**float(r))
For example:
>>> pretty_units(987654321098765432109876543210)
'816,968.5 YiB'
Drawing from all the previous answers, here is my take on it. It's an object which will store the file size in bytes as an integer. But when you try to print the object, you automatically get a human readable version.
class Filesize(object):
"""
Container for a size in bytes with a human readable representation
Use it like this::
>>> size = Filesize(123123123)
>>> print size
'117.4 MB'
"""
chunk = 1024
units = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB']
precisions = [0, 0, 1, 2, 2, 2]
def __init__(self, size):
self.size = size
def __int__(self):
return self.size
def __str__(self):
if self.size == 0: return '0 bytes'
from math import log
unit = self.units[min(int(log(self.size, self.chunk)), len(self.units) - 1)]
return self.format(unit)
def format(self, unit):
if unit not in self.units: raise Exception("Not a valid file size unit: %s" % unit)
if self.size == 1 and unit == 'bytes': return '1 byte'
exponent = self.units.index(unit)
quotient = float(self.size) / self.chunk**exponent
precision = self.precisions[exponent]
format_string = '{:.%sf} {}' % (precision)
return format_string.format(quotient, unit)
Modern Django have self template tag filesizeformat:
Formats the value like a human-readable file size (i.e. '13 KB', '4.1 MB', '102 bytes', etc.).
For example:
{{ value|filesizeformat }}
If value is 123456789, the output would be 117.7 MB.
More info: https://docs.djangoproject.com/en/1.10/ref/templates/builtins/#filesizeformat
I like the fixed precision of senderle's decimal version, so here's a sort of hybrid of that with joctee's answer above (did you know you could take logs with non-integer bases?):
from math import log
def human_readable_bytes(x):
# hybrid of https://stackoverflow.com/a/10171475/2595465
# with https://stackoverflow.com/a/5414105/2595465
if x == 0: return '0'
magnitude = int(log(abs(x),10.24))
if magnitude > 16:
format_str = '%iP'
denominator_mag = 15
else:
float_fmt = '%2.1f' if magnitude % 3 == 1 else '%1.2f'
illion = (magnitude + 1) // 3
format_str = float_fmt + ['', 'K', 'M', 'G', 'T', 'P'][illion]
return (format_str % (x * 1.0 / (1024 ** illion))).lstrip('0')
To get the file size in a human readable form, I created this function:
import os
def get_size(path):
size = os.path.getsize(path)
if size < 1024:
return f"{size} bytes"
elif size < pow(1024,2):
return f"{round(size/1024, 2)} KB"
elif size < pow(1024,3):
return f"{round(size/(pow(1024,2)), 2)} MB"
elif size < pow(1024,4):
return f"{round(size/(pow(1024,3)), 2)} GB"
>>> get_size("a.txt")
1.4KB
Here is an oneliner lambda without any imports to convert to human readable filesize. Pass the value in bytes.
to_human = lambda v : str(v >> ((max(v.bit_length()-1, 0)//10)*10)) +["", "K", "M", "G", "T", "P", "E"][max(v.bit_length()-1, 0)//10]
>>> to_human(1024)
'1K'
>>> to_human(1024*1024*3)
'3M'
How about a simple 2 liner:
def humanizeFileSize(filesize):
p = int(math.floor(math.log(filesize, 2)/10))
return "%.3f%s" % (filesize/math.pow(1024,p), ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][p])
Here is how it works under the hood:
Calculates log2(filesize)
Divides it by 10 to get the closest unit. (eg if size is 5000 bytes, the closest unit is Kb, so the answer should be X KiB)
Returns file_size/value_of_closest_unit along with unit.
It however doesn't work if filesize is 0 or negative (because log is undefined for 0 and -ve numbers). You can add extra checks for them:
def humanizeFileSize(filesize):
filesize = abs(filesize)
if (filesize==0):
return "0 Bytes"
p = int(math.floor(math.log(filesize, 2)/10))
return "%0.2f %s" % (filesize/math.pow(1024,p), ['Bytes','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][p])
Examples:
>>> humanizeFileSize(538244835492574234)
'478.06 PiB'
>>> humanizeFileSize(-924372537)
'881.55 MiB'
>>> humanizeFileSize(0)
'0 Bytes'
NOTE - There is a difference between Kb and KiB. KB means 1000 bytes, whereas KiB means 1024 bytes. KB,MB,GB are all multiples of 1000, whereas KiB, MiB, GiB etc are all multiples of 1024. More about it here
What you're about to find below is by no means the most performant or shortest solution among the ones already posted. Instead, it focuses on one particular issue that many of the other answers miss.
Namely the case when input like 999_995 is given:
Python 3.6.1 ...
...
>>> value = 999_995
>>> base = 1000
>>> math.log(value, base)
1.999999276174054
which, being truncated to the nearest integer and applied back to the input gives
>>> order = int(math.log(value, base))
>>> value/base**order
999.995
This seems to be exactly what we'd expect until we're required to control output precision. And this is when things start to get a bit difficult.
With the precision set to 2 digits we get:
>>> round(value/base**order, 2)
1000 # K
instead of 1M.
How can we counter that?
Of course, we can check for it explicitly:
if round(value/base**order, 2) == base:
order += 1
But can we do better? Can we get to know which way the order should be cut before we do the final step?
It turns out we can.
Assuming 0.5 decimal rounding rule, the above if condition translates into:
resulting in
def abbreviate(value, base=1000, precision=2, suffixes=None):
if suffixes is None:
suffixes = ['', 'K', 'M', 'B', 'T']
if value == 0:
return f'{0}{suffixes[0]}'
order_max = len(suffixes) - 1
order = log(abs(value), base)
order_corr = order - int(order) >= log(base - 0.5/10**precision, base)
order = min(int(order) + order_corr, order_max)
factored = round(value/base**order, precision)
return f'{factored:,g}{suffixes[order]}'
giving
>>> abbreviate(999_994)
'999.99K'
>>> abbreviate(999_995)
'1M'
>>> abbreviate(999_995, precision=3)
'999.995K'
>>> abbreviate(2042, base=1024)
'1.99K'
>>> abbreviate(2043, base=1024)
'2K'
def human_readable_data_quantity(quantity, multiple=1024):
if quantity == 0:
quantity = +0
SUFFIXES = ["B"] + [i + {1000: "B", 1024: "iB"}[multiple] for i in "KMGTPEZY"]
for suffix in SUFFIXES:
if quantity < multiple or suffix == SUFFIXES[-1]:
if suffix == SUFFIXES[0]:
return "%d%s" % (quantity, suffix)
else:
return "%.1f%s" % (quantity, suffix)
else:
quantity /= multiple
This feature if available in Boltons which is a very handy library to have for most projects.
>>> bytes2human(128991)
'126K'
>>> bytes2human(100001221)
'95M'
>>> bytes2human(0, 2)
'0.00B'
Here's something I wrote for a different question...
Much like xApple's answer, this object will always print in a human-readable format. The difference is that it's also a proper int, so you can do math with it!
It passes the format specifier straight through to the number format and tacks on the suffix, so it's pretty much guaranteed that the requested length will be exceeded by two or three characters. I've never had a use for this code, so I haven't bothered to fix it!
class ByteSize(int):
_KB = 1024
_suffixes = 'B', 'KB', 'MB', 'GB', 'PB'
def __new__(cls, *args, **kwargs):
return super().__new__(cls, *args, **kwargs)
def __init__(self, *args, **kwargs):
self.bytes = self.B = int(self)
self.kilobytes = self.KB = self / self._KB**1
self.megabytes = self.MB = self / self._KB**2
self.gigabytes = self.GB = self / self._KB**3
self.petabytes = self.PB = self / self._KB**4
*suffixes, last = self._suffixes
suffix = next((
suffix
for suffix in suffixes
if 1 < getattr(self, suffix) < self._KB
), last)
self.readable = suffix, getattr(self, suffix)
super().__init__()
def __str__(self):
return self.__format__('.2f')
def __repr__(self):
return '{}({})'.format(self.__class__.__name__, super().__repr__())
def __format__(self, format_spec):
suffix, val = self.readable
return '{val:{fmt}} {suf}'.format(val=val, fmt=format_spec, suf=suffix)
def __sub__(self, other):
return self.__class__(super().__sub__(other))
def __add__(self, other):
return self.__class__(super().__add__(other))
def __mul__(self, other):
return self.__class__(super().__mul__(other))
def __rsub__(self, other):
return self.__class__(super().__sub__(other))
def __radd__(self, other):
return self.__class__(super().__add__(other))
def __rmul__(self, other):
return self.__class__(super().__rmul__(other))
Usage:
>>> size = 6239397620
>>> print(size)
5.81 GB
>>> size.GB
5.810891855508089
>>> size.gigabytes
5.810891855508089
>>> size.PB
0.005674699077644618
>>> size.MB
5950.353260040283
>>> size
ByteSize(6239397620)
In case someone is wondering, to convert #Sridhar Ratnakumar's answer back to bytes you could do the following:
import math
def format_back_to_bytes(value):
for power, unit in enumerate(["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]):
if value[-3:-1] == unit:
return round(float(value[:-3])*math.pow(2, 10*power))
Usage:
>>> format_back_to_bytes('212.4GiB')
228062763418
Here is an option using while:
def number_format(n):
n2, n3 = n, 0
while n2 >= 1e3:
n2 /= 1e3
n3 += 1
return '%.3f' % n2 + ('', ' k', ' M', ' G')[n3]
s = number_format(9012345678)
print(s == '9.012 G')
https://docs.python.org/reference/compound_stmts.html#while
Referencing Sridhar Ratnakumar's answer, updated to:
def formatSize(sizeInBytes, decimalNum=1, isUnitWithI=False, sizeUnitSeperator=""):
"""format size to human readable string"""
# https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000
# K=kilo, M=mega, G=giga, T=tera, P=peta, E=exa, Z=zetta, Y=yotta
sizeUnitList = ['','K','M','G','T','P','E','Z']
largestUnit = 'Y'
if isUnitWithI:
sizeUnitListWithI = []
for curIdx, eachUnit in enumerate(sizeUnitList):
unitWithI = eachUnit
if curIdx >= 1:
unitWithI += 'i'
sizeUnitListWithI.append(unitWithI)
# sizeUnitListWithI = ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']
sizeUnitList = sizeUnitListWithI
largestUnit += 'i'
suffix = "B"
decimalFormat = "." + str(decimalNum) + "f" # ".1f"
finalFormat = "%" + decimalFormat + sizeUnitSeperator + "%s%s" # "%.1f%s%s"
sizeNum = sizeInBytes
for sizeUnit in sizeUnitList:
if abs(sizeNum) < 1024.0:
return finalFormat % (sizeNum, sizeUnit, suffix)
sizeNum /= 1024.0
return finalFormat % (sizeNum, largestUnit, suffix)
and example output is:
def testKb():
kbSize = 3746
kbStr = formatSize(kbSize)
print("%s -> %s" % (kbSize, kbStr))
def testI():
iSize = 87533
iStr = formatSize(iSize, isUnitWithI=True)
print("%s -> %s" % (iSize, iStr))
def testSeparator():
seperatorSize = 98654
seperatorStr = formatSize(seperatorSize, sizeUnitSeperator=" ")
print("%s -> %s" % (seperatorSize, seperatorStr))
def testBytes():
bytesSize = 352
bytesStr = formatSize(bytesSize)
print("%s -> %s" % (bytesSize, bytesStr))
def testMb():
mbSize = 76383285
mbStr = formatSize(mbSize, decimalNum=2)
print("%s -> %s" % (mbSize, mbStr))
def testTb():
tbSize = 763832854988542
tbStr = formatSize(tbSize, decimalNum=2)
print("%s -> %s" % (tbSize, tbStr))
def testPb():
pbSize = 763832854988542665
pbStr = formatSize(pbSize, decimalNum=4)
print("%s -> %s" % (pbSize, pbStr))
def demoFormatSize():
testKb()
testI()
testSeparator()
testBytes()
testMb()
testTb()
testPb()
# 3746 -> 3.7KB
# 87533 -> 85.5KiB
# 98654 -> 96.3 KB
# 352 -> 352.0B
# 76383285 -> 72.84MB
# 763832854988542 -> 694.70TB
# 763832854988542665 -> 678.4199PB
This solution might also appeal to you, depending on how your mind works:
from pathlib import Path
def get_size(path = Path('.')):
""" Gets file size, or total directory size """
if path.is_file():
size = path.stat().st_size
elif path.is_dir():
size = sum(file.stat().st_size for file in path.glob('*.*'))
return size
def format_size(path, unit="MB"):
""" Converts integers to common size units used in computing """
bit_shift = {"B": 0,
"kb": 7,
"KB": 10,
"mb": 17,
"MB": 20,
"gb": 27,
"GB": 30,
"TB": 40,}
return "{:,.0f}".format(get_size(path) / float(1 << bit_shift[unit])) + " " + unit
# Tests and test results
>>> get_size("d:\\media\\bags of fun.avi")
'38 MB'
>>> get_size("d:\\media\\bags of fun.avi","KB")
'38,763 KB'
>>> get_size("d:\\media\\bags of fun.avi","kb")
'310,104 kb'

Optimization of python multithreading script - huge memory consumption

I have a script (Django Management-Command) wiht over 800 lines of code.
This should import data from a external Web-Service, manipulate sth. and write it to a Postgres DB.
I use multithreading, because fetching data from webservice ist not very fast.
There ist one Thread for fetching the data with a bulk command to get a bulk of 64 data sets an write each data set in a queue.
Simultaneously at the beginning there is one worker-thread wich manipulates the data and write it to a DB.
In the main (handle) class, there is a while-loop that looks every 5 seconds for the quantity of elements in the queue and the quantity of running worker-threads.
If there are more than 500 elements in the queue and there are less then 5 worker-threads, it starts a new worker-thread.
All worker-threads get one item from the queue, manipulate sth., write the data set to the DB and append one String (up to 14 chars) to a different queue (#2).
The queue #2 ist necessary to have all imported objects at the end of the import to mark them as new respectively delete all other items from the DB, which are currently not imported.
For DB's with a quantity of not more then 200.000 data sets everything works fine.
But if there is for example a DB with 1.000.000 data sets, the memory consumption increases during the processing of the hole script up to 8 GB of RAM.
Is there a method to watch the memory consumption of threads and / or queue's?
Is there a method to "clean" memory after each while-loop?
# -*- coding: utf-8 -*-
import os
import threading
import Queue
import time
from optparse import OptionParser, make_option
from decimal import Decimal
from datetime import datetime
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.conf import settings
def is_someone_alive(thread_list):
so_alive = False
for t in thread_list:
if t.is_alive():
so_alive = True
return so_alive
class insert_item(threading.Thread):
VarLock2 = threading.Lock()
def __init__(self, queue1, item_still_exist2, name, *args, **options):
threading.Thread.__init__(self)
self.options = options
self.name = name
self.queue1 = queue1
self.item_still_exist2 = item_still_exist2
def run(self):
while not self.queue1.empty() or getItemBulkThread.isrunning:
item = self.queue1.get()
artikelobj, created = Artikel.objects.get_or_create(artikelnr=item['Nr'])
"""
manipulate data
"""
self.item_still_exist2.put(artikelobj.artikelnr)
artikelobj.save()
self.queue1.task_done()
class getItemBulkThread(threading.Thread):
isrunning = True
VarLock = threading.Lock()
def __init__(self, queue1, name, *args, **options):
threading.Thread.__init__(self)
self.options = options
if self.options['nrStart'] != '':
self.nrab = self.options['nrStart']
else:
self.nrab = ''
self.name = name
#self.nrab = '701307'
self.queue1 = queue1
self.anz_artikel = 64
self.max_artikel = 64
self.skipped = 0
self.max_skip = 20
def run(self):
count_sleep = 0
while True:
while self.queue1.qsize() > 5000:
time.sleep(5)
count_sleep += 1
if count_sleep > 0:
print "~ Artikel-Import %(csleep)sx für 5s pausiert, da Queue-Size > 5000" % {'csleep': count_sleep}
count_sleep = 0
try:
items = getItemBulk() # from external service
except Exception as exc1:
if ('"normal" abort-condition' in str(exc1)):
getItemBulkThread.VarLock.acquire()
getItemBulkThread.isrunning = False
getItemBulkThread.VarLock.release()
break
elif self.anz_artikel > 1:
self.anz_artikel /= 2
continue
elif self.skipped <= self.max_skip:
self.nrab += 1
self.skipped += 1
time.sleep(5)
continue
elif self.skipped > self.max_skip:
raise Exception("[EXCEPTION] Fehler im Thread: too much items skipped")
else:
getItemBulkThread.VarLock.acquire()
getItemBulkThread.isrunning = False
getItemBulkThread.VarLock.release()
raise
last_item = len(items) - 1
self.nrab = items[last_item]['Nr']
for artikel in items:
artikel['katItem'] = False
self.queue1.put(artikel)
if self.anz_artikel < self.max_artikel:
self.anz_artikel *= 2
self.skipped = 0
class Command(BaseCommand):
"""
Django-mgm-command
"""
help = u'Import'
def create_parser(self, prog_name, subcommand):
"""
Create and return the ``OptionParser`` which will be used to
parse the arguments to this command.
"""
return OptionParser(prog=prog_name, usage=self.usage(subcommand),
version=self.get_version(),
option_list=self.option_list,
conflict_handler="resolve")
def handle(self, *args, **options):
startzeit = datetime.now()
anzahl_Artikel_vorher = Artikel.objects.all().count() # Artikel is a model
self.options = options
items_vorher = []
queue1 = Queue.Queue()
item_still_exists2 = Queue.Queue()
running_threads = []
thread = getItemBulkThread(queue1, name="Artikel", *args, **options)
running_threads.append(thread)
thread.daemon = True
thread.start()
anz_worker_threads = 1
anz_max_worker_threads = 5
insert_threads = [insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': i + 1}, *args, **options) for i in range(anz_worker_threads)]
for thread in insert_threads:
running_threads.append(thread)
thread.setDaemon(True)
thread.start()
add_seconds = 5
element_grenze = 500
lastelemente = 0
asc_elemente = 0
anz_abgearbeitet = 0
while getItemBulkThread.isrunning or not queue1.empty():
time.sleep(add_seconds)
elemente = queue1.qsize()
akt_zeit = datetime.now()
diff_zeit = akt_zeit - startzeit
diff = elemente - lastelemente
anz_abgearbeitet = item_still_exists2.qsize()
art_speed = (anz_abgearbeitet / timedelta_total_seconds(diff_zeit)) * 60
ersetz_var = {'anz': elemente, 'zeit': diff_zeit, 'tstamp': akt_zeit.strftime('%Y.%m.%d-%H:%M:%S'), 'anzw': anz_worker_threads, 'diff': diff, 'anza': anz_abgearbeitet, 'art_speed': art_speed}
print("%(zeit)s vergangen - %(tstamp)s - %(anz)s Elemente in Queue, Veränderung: %(diff)s - Anz Worker: %(anzw)s - Artikel importiert: %(anza)s - Speed: %(art_speed)02d Art/Min" % ersetz_var)
if diff > 0:
asc_elemente += 1
else:
asc_elemente = 0
if asc_elemente > 2 and anz_worker_threads < anz_max_worker_threads and elemente > element_grenze:
ersetz_var = {'maxw': anz_max_worker_threads, 'nr': anz_worker_threads + 1, 'element_grenze': element_grenze}
print "~~ 2x in Folge mehr Queue-Elemente als vorher, die max. Anzahl an Workern %(maxw)s noch nicht erreicht und mehr als %(element_grenze)s Elemente in der Queue, daher Start eines neuen Workers (Nr %(nr)s)" % ersetz_var
anz_worker_threads += 1
thread = insert_item(queue1, item_still_exists2, name="Worker-%(anz)s" % {'anz': anz_worker_threads}, *args, **options)
running_threads.append(thread)
thread.setDaemon(True)
thread.start()
asc_elemente = 0
lastelemente = elemente
queue1.join()
items_nachher = []
while not item_still_exists2.empty():
item = item_still_exists2.get()
if item in items_vorher:
items_nachher.append(item)
items_vorher.remove(item)
item_still_exists2.task_done()
item_still_exists2.join()
if len(items_vorher) > 0:
Artikel.objects.filter(artikelnr__in=items_vorher).delete()
anzahl_Artikel_nachher = Artikel.objects.all().count()
anzahl_Artikel_diff = anzahl_Artikel_nachher - anzahl_Artikel_vorher
endzeit = datetime.now()
dauer = endzeit - startzeit
I've abbreviated the Code at some positions :)
A possible cause for excessive memory consumption is that you don't set a maximum size for the input queue. See the maxsize parameter.
On a related note, you write:
In the main (handle) class, there is a while-loop that looks every 5
seconds for the quantity of elements in the queue and the quantity of
running worker-threads. If there are more than 500 elements in the
queue and there are less then 5 worker-threads, it starts a new
worker-thread.
Creating a new thread does not necessarily increase the throughput. You should rather do some tests to determine the optimal number of threads, which may turn out to be 1.

Recursive directory list/analyze function doesn't seem to recurse right

I wrote what I thought was a straightforward Python script to traverse a given directory and tabulate all the file suffixes it finds. The output looks like this:
OTUS-ASIO:face fish$ sufs
>>> /Users/fish/Dropbox/ost2/face (total 194)
=== 1 1 -
=== css 16 -----
=== gif 14 -----
=== html 12 ----
=== icc 87 --------------------------
=== jpg 3 -
=== js 46 --------------
=== png 3 -
=== zip 2 -
... which would be great, if those values were correct. They are not. Here's what happens when I run it in a subdirectory of the directory I listed above:
OTUS-ASIO:face fish$ cd images/
OTUS-ASIO:images fish$ sufs
>>> /Users/fish/Dropbox/ost2/face/images (total 1016)
=== JPG 3 -
=== gif 17 -
=== ico 1 -
=== jpeg 1 -
=== jpg 901 --------------------------
=== png 87 ---
... It only seems to go one directory level down. Running the script one level up didn't pick up on the 'jpeg' suffix at all, and seemed to miss a good 898 jpg files.
The script in question is here:
#!/usr/bin/env python
# encoding: utf-8
"""
getfilesuffixes.py
Created by FI$H 2000 on 2010-10-15.
Copyright (c) 2010 OST, LLC. All rights reserved.
"""
import sys, os, getopt
help_message = '''
Prints a list of all the file suffixes found in each DIR, with counts.
Defaults to the current directory wth no args.
$ %s DIR [DIR DIR etc ...]
''' % os.path.basename(__file__)
dirs = dict()
skips = ('DS_Store','hgignore')
class Usage(Exception):
def __init__(self, msg):
self.msg = msg
def getmesomesuffixes(rootdir, thisdir=None):
if not thisdir:
thisdir = rootdir
for thing in [os.path.abspath(h) for h in os.listdir(thisdir)]:
if os.path.isdir(thing):
getmesomesuffixes(rootdir), thing)
else:
if thing.rfind('.') > -1:
suf = thing.rsplit('.').pop()
dirs[rootdir][suf] = dirs[rootdir].get(suf, 0) + 1
return
def main(argv=None):
if argv is None:
argv = sys.argv
try:
try:
opts, args = getopt.getopt(argv[1:], "h", ["help",])
except getopt.error, msg:
raise Usage(msg)
for option, value in opts:
if option == "-v":
verbose = True
if option in ("-h", "--help"):
raise Usage(help_message)
if len(args) == 0:
args.append(os.getcwd())
for durr in [os.path.abspath(arg) for arg in args]:
if os.path.isdir(durr):
dirs[durr] = dict()
for k, v in dirs.items():
getmesomesuffixes(k)
print ""
for k, v in dirs.items():
sufs = v.items()
sufs.sort()
maxcount = reduce(lambda fs, ns: fs > ns and fs or ns, map(lambda t: t[1], sufs), 1)
mincount = reduce(lambda fs, ns: fs < ns and fs or ns, map(lambda t: t[1], sufs), 1)
total = reduce(lambda fs, ns: fs + ns, map(lambda t: t[1], sufs), 0)
print ">>>\t\t\t%s (total %s)" % (k, total)
for suf, sufcount in sufs:
try:
skips.index(suf)
except ValueError:
print "===\t\t\t%12s\t %3s\t %s" % (suf, sufcount, "-" * (int(float(float(sufcount) / float(maxcount)) * 25) + 1))
print ""
except Usage, err:
print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
print >> sys.stderr, "\t for help use --help"
return 2
if __name__ == "__main__":
sys.exit(main())
It seems that getmesomesuffixes() is subtly not doing what I want it to. I hate to ask such an annoying question, but if anyone can spot whatever amateur-hour error I am making with a quick once-over, it would save me some serious frustration.
Yeah, Won't you be better off if you used os.walk
for root, dirs, files in os.walk(basedir):
... do you stuff ..
See the example at
http://docs.python.org/library/os.html
Also look at os.path.splitext(path), a finer way to find the type of your file.
>>> os.path.splitext('/d/c/as.jpeg')
('/d/c/as', '.jpeg')
>>>
Both of these together should simplify your code.
import os
import os.path
from collections import defaultdict
def foo(dir='.'):
d = defaultdict(int)
for _, _, files in os.walk(dir):
for f in files:
d[os.path.splitext(f)[1]] += 1
return d
if __name__ == '__main__':
d = foo()
for k, v in sorted(d.items()):
print k, v

Python: File formatting

I have a for loop which references a dictionary and prints out the value associated with the key. Code is below:
for i in data:
if i in dict:
print dict[i],
How would i format the output so a new line is created every 60 characters? and with the character count along the side for example:
0001
MRQLLLISDLDNTWVGDQQALEHLQEYLGDRRGNFYLAYATGRSYHSARELQKQVGLMEP
0061
DYWLTAVGSEIYHPEGLDQHWADYLSEHWQRDILQAIADGFEALKPQSPLEQNPWKISYH
0121 LDPQACPTVIDQLTEMLKETGIPVQVIFSSGKDVDLLPQRSNKGNATQYLQQHLAMEPSQ
It's a finicky formatting problem, but I think the following code:
import sys
class EveryN(object):
def __init__(self, n, outs):
self.n = n # chars/line
self.outs = outs # output stream
self.numo = 1 # next tag to write
self.tll = 0 # tot chars on this line
def write(self, s):
while True:
if self.tll == 0: # start of line: emit tag
self.outs.write('%4.4d ' % self.numo)
self.numo += self.n
# wite up to N chars/line, no more
numw = min(len(s), self.n - self.tll)
self.outs.write(s[:numw])
self.tll += numw
if self.tll >= self.n:
self.tll = 0
self.outs.write('\n')
s = s[numw:]
if not s: break
if __name__ == '__main__':
sys.stdout = EveryN(60, sys.stdout)
for i, a in enumerate('abcdefgh'):
print a*(5+ i*5),
shows how to do it -- the output when running for demonstration purposes as the main script (five a's, ten b's, etc, with spaces in-between) is:
0001 aaaaa bbbbbbbbbb ccccccccccccccc dddddddddddddddddddd eeeeee
0061 eeeeeeeeeeeeeeeeeee ffffffffffffffffffffffffffffff ggggggggg
0121 gggggggggggggggggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
0181 hhhhhhh
# test data
data = range(10)
the_dict = dict((i, str(i)*200) for i in range( 10 ))
# your loops as a generator
lines = ( the_dict[i] for i in data if i in the_dict )
def format( line ):
def splitter():
k = 0
while True:
r = line[k:k+60] # take a 60 char block
if r: # if there are any chars left
yield "%04d %s" % (k+1, r) # format them
else:
break
k += 60
return '\n'.join(splitter()) # join all the numbered blocks
for line in lines:
print format(line)
I haven't tested it on actual data, but I believe the code below would do the job. It first builds up the whole string, then outputs it a 60-character line at a time. It uses the three-argument version of range() to count by 60.
s = ''.join(dict[i] for i in data if i in dict)
for i in range(0, len(s), 60):
print '%04d %s' % (i+1, s[i:i+60])
It seems like you're looking for textwrap
The textwrap module provides two convenience functions, wrap() and
fill(), as well as TextWrapper, the class that does all the work, and
a utility function dedent(). If you’re just wrapping or filling one or
two text strings, the convenience functions should be good enough;
otherwise, you should use an instance of TextWrapper for efficiency.

Categories

Resources