I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is the PSL records are large (10-30 Mb text documents). I wrote a program that works on short records and on the long records given enough time but it took way longer than specified. I was told the program shouldn't take more than ~15 seconds. Mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys
class PSLreader :
'''
Class to provide reading of a file containing psl alignments
formatted sequences:
object instantiation:
myPSLreader = PSLreader(<file name>):
object attributes:
fname: the initial file name
methods:
readPSL() : reads psl file, yielding those alignments that are within the first or last
1000 nt
readPSLpairs() : yields psl pairs that support a circular hypothesis
Author: David Bernick
Date: May 12, 2013
'''
def __init__ (self, fname=''):
'''contructor: saves attribute fname '''
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
'''
using filename given in init, returns each filtered psl records
that contain alignments that are within the terminal 1000nt of
the target. Incomplete psl records are discarded.
If filename was not provided, stdin is used.
This method selects for alignments that could may be part of a
circle.
Illumina pairs aligned to the top strand would have read1(+) and read2(-).
For the bottoms trand, read1(-) and read2(+).
For potential circularity,
these are the conditions that can support circularity:
read1(+) near the 3' terminus
read1(-) near the 5' terminus
read2(-) near the 5' terminus
read2(+) near the 3' terminus
so...
any read(+) near the 3', or
any read(-) near the 5'
'''
nearEnd = 1000 # this constant determines "near the end"
with self.doOpen() as fileH:
for line in fileH:
pslList = line.split()
if len(pslList) < 17:
continue
tSize = int(pslList[14])
tStart = int(pslList[15])
strand = str(pslList[8])
if strand.startswith('+') and (tSize - tStart > nearEnd):
continue
elif strand.startswith('-') and (tStart > nearEnd):
continue
yield line
def readPSLpairs (self):
read1 = []
read2 = []
for psl in self.readPSL():
parsed_psl = psl.split()
strand = parsed_psl[9][-1]
if strand == '1':
read1.append(parsed_psl)
elif strand == '2':
read2.append(parsed_psl)
output = {}
for psl1 in read1:
name1 = psl1[9][:-1]
contig1 = psl1[13]
for psl2 in read2:
name2 = psl2[9][:-1]
contig2 = psl2[13]
if name1 == name2 and contig1 == contig2:
try:
output[contig1] += 1
break
except:
output[contig1] = 1
break
print(output)
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise (a):
for leftItem in a[1]:
for rightItem in a[2]:
if leftItem[1] is rightItem[1]:
print (a)
thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
['John', 'violin', 1], ['John', 'oboe', 2],
['Patrick', 'theremin', 1], ['Patrick', 'lute',2] ]
thisGroup = None
thisGroupList = [ [], [], [] ]
for name, instrument, num in thisStream:
if name != thisGroup:
doSomethingPairwise(thisGroupList)
thisGroup = name
thisGroupList = [ [], [], [] ]
thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.
I hope help you, since, the question needs a best input example file
#is better create PSLRecord class
class PSLRecord:
def __init__(self, line):
pslList = line.split()
properties = ("matches", "misMatches", "repMatches", "nCount",
"qNumInsert", "qBaseInsert", "tNumInsert",
"tBaseInsert", "strand", "qName", "qSize", "qStart",
"qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
"blockSizes", "qStarts", "tStarts")
self.__dict__.update(dict(zip(properties, pslList)))
class PSLreader :
def __init__ (self, fname=''):
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
with self.doOpen() as fileH:
for line in fileH:
pslrc = PSLRecord(line)
yield pslrc
#return a dictionary with all psl records group by qName and tName
def readPSLpairs (self):
dictpsl = {}
for pslrc in self.readPSL():
#OP requirement, remove '1' or '2' char, in pslrc.qName[:-1]
key = (pslrc.qName[:-1], pslrc.tName)
if not key in dictpsl:
dictpsl[key] = []
dictpsl[key].append(pslrc)
return dictpsl
#Function filter .... is better out and self-contained
def f_filter(pslrec, nearEnd = 1000):
if (pslrec.strand.startswith('+') and
(int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
return False
if (pslrec.strand.startswith('-') and
(int(pslrec.tStart) > nearEnd)):
return False
return True
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
#read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()
from itertools import product
#product from itertools
#(1) x (2,3) = (1,2),(1,3)
output = {}
for key, v in dictpsl.items():
name, contig = key
#i get filters aligns in principal strand
strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '1']
#i get filters aligns in secondary strand
strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '2']
for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
#This For has fewer comparisons, since I was grouped before
if not contig in output:
output[contig] = 1
output[contig] += 1
Note: 10-30 Mb isn't large file, if you ask me
A function to return human readable size from bytes size:
>>> human_readable(2048)
'2 kilobytes'
>>>
How to do this?
Addressing the above "too small a task to require a library" issue by a straightforward implementation (using f-strings, so Python 3.6+):
def sizeof_fmt(num, suffix="B"):
for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
if abs(num) < 1024.0:
return f"{num:3.1f}{unit}{suffix}"
num /= 1024.0
return f"{num:.1f}Yi{suffix}"
Supports:
all currently known binary prefixes
negative and positive numbers
numbers larger than 1000 Yobibytes
arbitrary units (maybe you like to count in Gibibits!)
Example:
>>> sizeof_fmt(168963795964)
'157.4GiB'
by Fred Cirera
A library that has all the functionality that it seems you're looking for is humanize. humanize.naturalsize() seems to do everything you're looking for.
Example code (python 3.10)
import humanize
disk_sizes_list = [1, 100, 999, 1000,1024, 2000,2048, 3000, 9999, 10000, 2048000000, 9990000000, 9000000000000000000000]
for size in disk_sizes_list:
natural_size = humanize.naturalsize(size)
binary_size = humanize.naturalsize(size, binary=True)
print(f" {natural_size} \t| {binary_size}\t|{size}")
Output
1 Byte | 1 Byte |1
100 Bytes | 100 Bytes |100
999 Bytes | 999 Bytes |999
1.0 kB | 1000 Bytes |1000
1.0 kB | 1.0 KiB |1024
2.0 kB | 2.0 KiB |2000
2.0 kB | 2.0 KiB |2048
3.0 kB | 2.9 KiB |3000
10.0 kB | 9.8 KiB |9999
10.0 kB | 9.8 KiB |10000
2.0 GB | 1.9 GiB |2048000000
10.0 GB | 9.3 GiB |9990000000
9.0 ZB | 7.6 ZiB |9000000000000000000000
The following works in Python 3.6+, is, in my opinion, the easiest to understand answer on here, and lets you customize the amount of decimal places used.
def human_readable_size(size, decimal_places=2):
for unit in ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']:
if size < 1024.0 or unit == 'PiB':
break
size /= 1024.0
return f"{size:.{decimal_places}f} {unit}"
There's always got to be one of those guys. Well today it's me. Here's a one-liner -- or two lines if you count the function signature.
def human_size(bytes, units=[' bytes','KB','MB','GB','TB', 'PB', 'EB']):
""" Returns a human readable string representation of bytes """
return str(bytes) + units[0] if bytes < 1024 else human_size(bytes>>10, units[1:])
>>> human_size(123)
123 bytes
>>> human_size(123456789)
117GB
If you need sizes bigger than an Exabyte, it's a little bit more gnarly:
def human_size(bytes, units=[' bytes','KB','MB','GB','TB', 'PB', 'EB']):
return str(bytes) + units[0] if bytes < 1024 else human_size(bytes>>10, units[1:]) if units[1:] else f'{bytes>>10}ZB'
Here's my version. It does not use a for-loop. It has constant complexity, O(1), and is in theory more efficient than the answers here that use a for-loop.
from math import log
unit_list = zip(['bytes', 'kB', 'MB', 'GB', 'TB', 'PB'], [0, 0, 1, 2, 2, 2])
def sizeof_fmt(num):
"""Human friendly file size"""
if num > 1:
exponent = min(int(log(num, 1024)), len(unit_list) - 1)
quotient = float(num) / 1024**exponent
unit, num_decimals = unit_list[exponent]
format_string = '{:.%sf} {}' % (num_decimals)
return format_string.format(quotient, unit)
if num == 0:
return '0 bytes'
if num == 1:
return '1 byte'
To make it more clear what is going on, we can omit the code for the string formatting. Here are the lines that actually do the work:
exponent = int(log(num, 1024))
quotient = num / 1024**exponent
unit_list[exponent]
I recently came up with a version that avoids loops, using log2 to determine the size order which doubles as a shift and an index into the suffix list:
from math import log2
_suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
def file_size(size):
# determine binary order in steps of size 10
# (coerce to int, // still returns a float)
order = int(log2(size) / 10) if size else 0
# format file size
# (.4g results in rounded numbers for exact matches and max 3 decimals,
# should never resort to exponent values)
return '{:.4g} {}'.format(size / (1 << (order * 10)), _suffixes[order])
Could well be considered unpythonic for its readability, though.
If you're using Django installed you can also try filesizeformat:
from django.template.defaultfilters import filesizeformat
filesizeformat(1073741824)
=>
"1.0 GB"
You should use "humanize".
>>> humanize.naturalsize(1000000)
'1.0 MB'
>>> humanize.naturalsize(1000000, binary=True)
'976.6 KiB'
>>> humanize.naturalsize(1000000, gnu=True)
'976.6K'
Reference:
https://pypi.org/project/humanize/
One such library is hurry.filesize.
>>> from hurry.filesize import alternative
>>> size(1, system=alternative)
'1 byte'
>>> size(10, system=alternative)
'10 bytes'
>>> size(1024, system=alternative)
'1 KB'
Using either powers of 1000 or kibibytes would be more standard-friendly:
def sizeof_fmt(num, use_kibibyte=True):
base, suffix = [(1000.,'B'),(1024.,'iB')][use_kibibyte]
for x in ['B'] + map(lambda x: x+suffix, list('kMGTP')):
if -base < num < base:
return "%3.1f %s" % (num, x)
num /= base
return "%3.1f %s" % (num, x)
P.S. Never trust a library that prints thousands with the K (uppercase) suffix :)
The HumanFriendly project helps with this.
import humanfriendly
humanfriendly.format_size(1024)
The above code will give 1KB as answer.
Examples can be found here.
Riffing on the snippet provided as an alternative to hurry.filesize(), here is a snippet that gives varying precision numbers based on the prefix used. It isn't as terse as some snippets, but I like the results.
def human_size(size_bytes):
"""
format a size in bytes into a 'human' file size, e.g. bytes, KB, MB, GB, TB, PB
Note that bytes/KB will be reported in whole numbers but MB and above will have greater precision
e.g. 1 byte, 43 bytes, 443 KB, 4.3 MB, 4.43 GB, etc
"""
if size_bytes == 1:
# because I really hate unnecessary plurals
return "1 byte"
suffixes_table = [('bytes',0),('KB',0),('MB',1),('GB',2),('TB',2), ('PB',2)]
num = float(size_bytes)
for suffix, precision in suffixes_table:
if num < 1024.0:
break
num /= 1024.0
if precision == 0:
formatted_size = "%d" % num
else:
formatted_size = str(round(num, ndigits=precision))
return "%s %s" % (formatted_size, suffix)
This will do what you need in almost any situation, is customizable with optional arguments, and as you can see, is pretty much self-documenting:
from math import log
def pretty_size(n,pow=0,b=1024,u='B',pre=['']+[p+'i'for p in'KMGTPEZY']):
pow,n=min(int(log(max(n*b**pow,1),b)),len(pre)-1),n*b**pow
return "%%.%if %%s%%s"%abs(pow%(-pow-1))%(n/b**float(pow),pre[pow],u)
Example output:
>>> pretty_size(42)
'42 B'
>>> pretty_size(2015)
'2.0 KiB'
>>> pretty_size(987654321)
'941.9 MiB'
>>> pretty_size(9876543210)
'9.2 GiB'
>>> pretty_size(0.5,pow=1)
'512 B'
>>> pretty_size(0)
'0 B'
Advanced customizations:
>>> pretty_size(987654321,b=1000,u='bytes',pre=['','kilo','mega','giga'])
'987.7 megabytes'
>>> pretty_size(9876543210,b=1000,u='bytes',pre=['','kilo','mega','giga'])
'9.9 gigabytes'
This code is both Python 2 and Python 3 compatible. PEP8 compliance is an exercise for the reader. Remember, it's the output that's pretty.
Update:
If you need thousands commas, just apply the obvious extension:
def prettier_size(n,pow=0,b=1024,u='B',pre=['']+[p+'i'for p in'KMGTPEZY']):
r,f=min(int(log(max(n*b**pow,1),b)),len(pre)-1),'{:,.%if} %s%s'
return (f%(abs(r%(-r-1)),pre[r],u)).format(n*b**pow/b**float(r))
For example:
>>> pretty_units(987654321098765432109876543210)
'816,968.5 YiB'
Drawing from all the previous answers, here is my take on it. It's an object which will store the file size in bytes as an integer. But when you try to print the object, you automatically get a human readable version.
class Filesize(object):
"""
Container for a size in bytes with a human readable representation
Use it like this::
>>> size = Filesize(123123123)
>>> print size
'117.4 MB'
"""
chunk = 1024
units = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB']
precisions = [0, 0, 1, 2, 2, 2]
def __init__(self, size):
self.size = size
def __int__(self):
return self.size
def __str__(self):
if self.size == 0: return '0 bytes'
from math import log
unit = self.units[min(int(log(self.size, self.chunk)), len(self.units) - 1)]
return self.format(unit)
def format(self, unit):
if unit not in self.units: raise Exception("Not a valid file size unit: %s" % unit)
if self.size == 1 and unit == 'bytes': return '1 byte'
exponent = self.units.index(unit)
quotient = float(self.size) / self.chunk**exponent
precision = self.precisions[exponent]
format_string = '{:.%sf} {}' % (precision)
return format_string.format(quotient, unit)
Modern Django have self template tag filesizeformat:
Formats the value like a human-readable file size (i.e. '13 KB', '4.1 MB', '102 bytes', etc.).
For example:
{{ value|filesizeformat }}
If value is 123456789, the output would be 117.7 MB.
More info: https://docs.djangoproject.com/en/1.10/ref/templates/builtins/#filesizeformat
I like the fixed precision of senderle's decimal version, so here's a sort of hybrid of that with joctee's answer above (did you know you could take logs with non-integer bases?):
from math import log
def human_readable_bytes(x):
# hybrid of https://stackoverflow.com/a/10171475/2595465
# with https://stackoverflow.com/a/5414105/2595465
if x == 0: return '0'
magnitude = int(log(abs(x),10.24))
if magnitude > 16:
format_str = '%iP'
denominator_mag = 15
else:
float_fmt = '%2.1f' if magnitude % 3 == 1 else '%1.2f'
illion = (magnitude + 1) // 3
format_str = float_fmt + ['', 'K', 'M', 'G', 'T', 'P'][illion]
return (format_str % (x * 1.0 / (1024 ** illion))).lstrip('0')
To get the file size in a human readable form, I created this function:
import os
def get_size(path):
size = os.path.getsize(path)
if size < 1024:
return f"{size} bytes"
elif size < pow(1024,2):
return f"{round(size/1024, 2)} KB"
elif size < pow(1024,3):
return f"{round(size/(pow(1024,2)), 2)} MB"
elif size < pow(1024,4):
return f"{round(size/(pow(1024,3)), 2)} GB"
>>> get_size("a.txt")
1.4KB
Here is an oneliner lambda without any imports to convert to human readable filesize. Pass the value in bytes.
to_human = lambda v : str(v >> ((max(v.bit_length()-1, 0)//10)*10)) +["", "K", "M", "G", "T", "P", "E"][max(v.bit_length()-1, 0)//10]
>>> to_human(1024)
'1K'
>>> to_human(1024*1024*3)
'3M'
How about a simple 2 liner:
def humanizeFileSize(filesize):
p = int(math.floor(math.log(filesize, 2)/10))
return "%.3f%s" % (filesize/math.pow(1024,p), ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][p])
Here is how it works under the hood:
Calculates log2(filesize)
Divides it by 10 to get the closest unit. (eg if size is 5000 bytes, the closest unit is Kb, so the answer should be X KiB)
Returns file_size/value_of_closest_unit along with unit.
It however doesn't work if filesize is 0 or negative (because log is undefined for 0 and -ve numbers). You can add extra checks for them:
def humanizeFileSize(filesize):
filesize = abs(filesize)
if (filesize==0):
return "0 Bytes"
p = int(math.floor(math.log(filesize, 2)/10))
return "%0.2f %s" % (filesize/math.pow(1024,p), ['Bytes','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][p])
Examples:
>>> humanizeFileSize(538244835492574234)
'478.06 PiB'
>>> humanizeFileSize(-924372537)
'881.55 MiB'
>>> humanizeFileSize(0)
'0 Bytes'
NOTE - There is a difference between Kb and KiB. KB means 1000 bytes, whereas KiB means 1024 bytes. KB,MB,GB are all multiples of 1000, whereas KiB, MiB, GiB etc are all multiples of 1024. More about it here
What you're about to find below is by no means the most performant or shortest solution among the ones already posted. Instead, it focuses on one particular issue that many of the other answers miss.
Namely the case when input like 999_995 is given:
Python 3.6.1 ...
...
>>> value = 999_995
>>> base = 1000
>>> math.log(value, base)
1.999999276174054
which, being truncated to the nearest integer and applied back to the input gives
>>> order = int(math.log(value, base))
>>> value/base**order
999.995
This seems to be exactly what we'd expect until we're required to control output precision. And this is when things start to get a bit difficult.
With the precision set to 2 digits we get:
>>> round(value/base**order, 2)
1000 # K
instead of 1M.
How can we counter that?
Of course, we can check for it explicitly:
if round(value/base**order, 2) == base:
order += 1
But can we do better? Can we get to know which way the order should be cut before we do the final step?
It turns out we can.
Assuming 0.5 decimal rounding rule, the above if condition translates into:
resulting in
def abbreviate(value, base=1000, precision=2, suffixes=None):
if suffixes is None:
suffixes = ['', 'K', 'M', 'B', 'T']
if value == 0:
return f'{0}{suffixes[0]}'
order_max = len(suffixes) - 1
order = log(abs(value), base)
order_corr = order - int(order) >= log(base - 0.5/10**precision, base)
order = min(int(order) + order_corr, order_max)
factored = round(value/base**order, precision)
return f'{factored:,g}{suffixes[order]}'
giving
>>> abbreviate(999_994)
'999.99K'
>>> abbreviate(999_995)
'1M'
>>> abbreviate(999_995, precision=3)
'999.995K'
>>> abbreviate(2042, base=1024)
'1.99K'
>>> abbreviate(2043, base=1024)
'2K'
def human_readable_data_quantity(quantity, multiple=1024):
if quantity == 0:
quantity = +0
SUFFIXES = ["B"] + [i + {1000: "B", 1024: "iB"}[multiple] for i in "KMGTPEZY"]
for suffix in SUFFIXES:
if quantity < multiple or suffix == SUFFIXES[-1]:
if suffix == SUFFIXES[0]:
return "%d%s" % (quantity, suffix)
else:
return "%.1f%s" % (quantity, suffix)
else:
quantity /= multiple
This feature if available in Boltons which is a very handy library to have for most projects.
>>> bytes2human(128991)
'126K'
>>> bytes2human(100001221)
'95M'
>>> bytes2human(0, 2)
'0.00B'
Here's something I wrote for a different question...
Much like xApple's answer, this object will always print in a human-readable format. The difference is that it's also a proper int, so you can do math with it!
It passes the format specifier straight through to the number format and tacks on the suffix, so it's pretty much guaranteed that the requested length will be exceeded by two or three characters. I've never had a use for this code, so I haven't bothered to fix it!
class ByteSize(int):
_KB = 1024
_suffixes = 'B', 'KB', 'MB', 'GB', 'PB'
def __new__(cls, *args, **kwargs):
return super().__new__(cls, *args, **kwargs)
def __init__(self, *args, **kwargs):
self.bytes = self.B = int(self)
self.kilobytes = self.KB = self / self._KB**1
self.megabytes = self.MB = self / self._KB**2
self.gigabytes = self.GB = self / self._KB**3
self.petabytes = self.PB = self / self._KB**4
*suffixes, last = self._suffixes
suffix = next((
suffix
for suffix in suffixes
if 1 < getattr(self, suffix) < self._KB
), last)
self.readable = suffix, getattr(self, suffix)
super().__init__()
def __str__(self):
return self.__format__('.2f')
def __repr__(self):
return '{}({})'.format(self.__class__.__name__, super().__repr__())
def __format__(self, format_spec):
suffix, val = self.readable
return '{val:{fmt}} {suf}'.format(val=val, fmt=format_spec, suf=suffix)
def __sub__(self, other):
return self.__class__(super().__sub__(other))
def __add__(self, other):
return self.__class__(super().__add__(other))
def __mul__(self, other):
return self.__class__(super().__mul__(other))
def __rsub__(self, other):
return self.__class__(super().__sub__(other))
def __radd__(self, other):
return self.__class__(super().__add__(other))
def __rmul__(self, other):
return self.__class__(super().__rmul__(other))
Usage:
>>> size = 6239397620
>>> print(size)
5.81 GB
>>> size.GB
5.810891855508089
>>> size.gigabytes
5.810891855508089
>>> size.PB
0.005674699077644618
>>> size.MB
5950.353260040283
>>> size
ByteSize(6239397620)
In case someone is wondering, to convert #Sridhar Ratnakumar's answer back to bytes you could do the following:
import math
def format_back_to_bytes(value):
for power, unit in enumerate(["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]):
if value[-3:-1] == unit:
return round(float(value[:-3])*math.pow(2, 10*power))
Usage:
>>> format_back_to_bytes('212.4GiB')
228062763418
Here is an option using while:
def number_format(n):
n2, n3 = n, 0
while n2 >= 1e3:
n2 /= 1e3
n3 += 1
return '%.3f' % n2 + ('', ' k', ' M', ' G')[n3]
s = number_format(9012345678)
print(s == '9.012 G')
https://docs.python.org/reference/compound_stmts.html#while
Referencing Sridhar Ratnakumar's answer, updated to:
def formatSize(sizeInBytes, decimalNum=1, isUnitWithI=False, sizeUnitSeperator=""):
"""format size to human readable string"""
# https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000
# K=kilo, M=mega, G=giga, T=tera, P=peta, E=exa, Z=zetta, Y=yotta
sizeUnitList = ['','K','M','G','T','P','E','Z']
largestUnit = 'Y'
if isUnitWithI:
sizeUnitListWithI = []
for curIdx, eachUnit in enumerate(sizeUnitList):
unitWithI = eachUnit
if curIdx >= 1:
unitWithI += 'i'
sizeUnitListWithI.append(unitWithI)
# sizeUnitListWithI = ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']
sizeUnitList = sizeUnitListWithI
largestUnit += 'i'
suffix = "B"
decimalFormat = "." + str(decimalNum) + "f" # ".1f"
finalFormat = "%" + decimalFormat + sizeUnitSeperator + "%s%s" # "%.1f%s%s"
sizeNum = sizeInBytes
for sizeUnit in sizeUnitList:
if abs(sizeNum) < 1024.0:
return finalFormat % (sizeNum, sizeUnit, suffix)
sizeNum /= 1024.0
return finalFormat % (sizeNum, largestUnit, suffix)
and example output is:
def testKb():
kbSize = 3746
kbStr = formatSize(kbSize)
print("%s -> %s" % (kbSize, kbStr))
def testI():
iSize = 87533
iStr = formatSize(iSize, isUnitWithI=True)
print("%s -> %s" % (iSize, iStr))
def testSeparator():
seperatorSize = 98654
seperatorStr = formatSize(seperatorSize, sizeUnitSeperator=" ")
print("%s -> %s" % (seperatorSize, seperatorStr))
def testBytes():
bytesSize = 352
bytesStr = formatSize(bytesSize)
print("%s -> %s" % (bytesSize, bytesStr))
def testMb():
mbSize = 76383285
mbStr = formatSize(mbSize, decimalNum=2)
print("%s -> %s" % (mbSize, mbStr))
def testTb():
tbSize = 763832854988542
tbStr = formatSize(tbSize, decimalNum=2)
print("%s -> %s" % (tbSize, tbStr))
def testPb():
pbSize = 763832854988542665
pbStr = formatSize(pbSize, decimalNum=4)
print("%s -> %s" % (pbSize, pbStr))
def demoFormatSize():
testKb()
testI()
testSeparator()
testBytes()
testMb()
testTb()
testPb()
# 3746 -> 3.7KB
# 87533 -> 85.5KiB
# 98654 -> 96.3 KB
# 352 -> 352.0B
# 76383285 -> 72.84MB
# 763832854988542 -> 694.70TB
# 763832854988542665 -> 678.4199PB
This solution might also appeal to you, depending on how your mind works:
from pathlib import Path
def get_size(path = Path('.')):
""" Gets file size, or total directory size """
if path.is_file():
size = path.stat().st_size
elif path.is_dir():
size = sum(file.stat().st_size for file in path.glob('*.*'))
return size
def format_size(path, unit="MB"):
""" Converts integers to common size units used in computing """
bit_shift = {"B": 0,
"kb": 7,
"KB": 10,
"mb": 17,
"MB": 20,
"gb": 27,
"GB": 30,
"TB": 40,}
return "{:,.0f}".format(get_size(path) / float(1 << bit_shift[unit])) + " " + unit
# Tests and test results
>>> get_size("d:\\media\\bags of fun.avi")
'38 MB'
>>> get_size("d:\\media\\bags of fun.avi","KB")
'38,763 KB'
>>> get_size("d:\\media\\bags of fun.avi","kb")
'310,104 kb'
I am writing a Python class to model a process and I want to initialized the parameters from a file, say 'input.dat'. The format of the input file looks like this.
'input.dat' file:
Z0: 0 0
k: 0.1
g: 1
Delta: 20
t_end: 300
The code I wrote is the following. It works but appears redundant and inflexible. Is there a better way to do the job? Such as a loop to do readline() and then match the keyword?
def load(self,filename="input.dat"):
FILE = open(filename)
s = FILE.readline().split()
if len(s) is 3:
self.z0 = [float(s[1]),float(s[2])] # initial state
s = FILE.readline().split()
if len(s) is 2:
self.k = float(s[1]) # kappa
s = FILE.readline().split()
if len(s) is 2:
self.g = float(s[1])
s = FILE.readline().split()
if len(s) is 2:
self.D = float(s[1]) # Delta
s = FILE.readline().split()
if len(s) is 2:
self.T = float(s[1]) # end time
Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py:
Z0 = (0, 0)
k = 0.1
g = 1
Delta = 20
t_end = 300
Then in your code all you need is:
import params
fancy_calculation(10, k=params.k, delta=params.Delta)
The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example:
k = 0.1
Delta = 20
g = 3 * k + Delta
Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.
If you are open to some other kind of file where you can keep your parameters, I would suggest you to use a YAML file.
The Python library is PyYAML. This is how you can easily use it with Python.
For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML.
The benefit is you can read the parameter values as lists or maps.
You would love it!
Try the following:
def load(self, filename="input.dat"):
d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"}
FILE = open(filename)
for line in FILE:
name, value = line.split(":")
value = value.strip()
if " " in value:
value = map(float, value.split())
else:
value = float(value)
setattr(self, d[name], value)
Proof that it works:
>>> class A(object): pass
...
>>> a = A()
>>> load(a)
>>> a.__dict__
{'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}
As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read-in. I've tried to make the code as data-driven as possible, so relatively flexible.
# maps label to attribute name and types
label_attr_map = {
"Z0:": ["z0", float, float],
"k:": [ "k", float],
"g:": [ "g", float],
"Delta:": [ "D", float],
"t_end:": [ "T", float]
}
class Params(object):
def __init__(self, input_file_name):
with open(input_file_name, 'r') as input_file:
for line in input_file:
row = line.split()
label = row[0]
data = row[1:] # rest of row is data list
attr = label_attr_map[label][0]
datatypes = label_attr_map[label][1:]
values = [(datatypes[i](data[i])) for i in range(len(data))]
self.__dict__[attr] = values if len(values) > 1 else values[0]
params = Params('input.dat')
print 'params.z0:', params.z0
print 'params.k:', params.k
print 'params.g:', params.g
print 'params.D:', params.D
print 'params.T:', params.T
Output:
params.z0: [0.0, 0.0]
params.k: 0.1
params.g: 1.0
params.D: 20.0
params.T: 300.0
Perhaps this might give you what you need:
def load(self,filename='input.dat'):
with open(filename) as fh:
for line in fh:
s = line.split()
if len(s) == 2:
setattr(self,s[1],s[2])
elif len(s) == 3:
setattr(self,s[1],s[2:])
I also didn't include any error checking, but setattr is very handy.
Something like this:
def load(self,filename="input.dat"):
# maps names to number of fields they need
# only necessary for variables with more than 1 field
argmap = dict(Z0=2)
# maps config file names to their attribute names on the object
# if name is the same both places, no need
namemap = dict(Z0="z0", Delta="D", t_end="T")
with open(filename) as FILE:
for line in FILE:
s = line.split()
var = s[0].rstrip(":")
try:
val = [float(x) for x in s[1:]]
except ValueError:
continue
if len(val) == varmap.get(var, 1):
if len(val) == 1:
val = val[0]
setattr(self, namemap.get(var, var), val)
Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key.
class Data(object):
def __init__(self, path='infile.dat'):
with open(path, 'r') as fo:
for line in fo.readlines():
if len(line) < 2: continue
parts = [s.strip(' :\n') for s in line.split(' ', 1)]
numbers = [float(s) for s in parts[1].split()]
# This is optional... do you want single values to be stored in lists?
if len(numbers) == 1: numbers = numbers[0]
self.__dict__[parts[0]] = numbers
# print parts -- debug
obj = Data('infile.dat')
print obj.g
print obj.Delta
print obj.Z0
At the end of this, we print out a few of the keys. Here's the output of those.
1.0
20.0
[0.0, 0.0]
For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.
Here's another one
def splitstrip(s):
return s.split(':')[1].strip()
with open('input.dat','r') as f:
a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')]
a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')])
;)
I want to parse srt subtitles:
1
00:00:12,815 --> 00:00:14,509
Chlapi, jak to jde s
těma pracovníma světlama?.
2
00:00:14,815 --> 00:00:16,498
Trochu je zesilujeme.
3
00:00:16,934 --> 00:00:17,814
Jo, sleduj.
Every item into structure. With this regexs:
A:
RE_ITEM = re.compile(r'(?P<index>\d+).'
r'(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> '
r'(?P<end>\d{2}:\d{2}:\d{2},\d{3}).'
r'(?P<text>.*?)', re.DOTALL)
B:
RE_ITEM = re.compile(r'(?P<index>\d+).'
r'(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> '
r'(?P<end>\d{2}:\d{2}:\d{2},\d{3}).'
r'(?P<text>.*)', re.DOTALL)
And this code:
for i in Subtitles.RE_ITEM.finditer(text):
result.append((i.group('index'), i.group('start'),
i.group('end'), i.group('text')))
With code B I have only one item in array (because of greedy .*) and with code A I have empty 'text' because of no-greedy .*?
How to cure this?
Thanks
Why not use pysrt?
I became quite frustrated with srt libraries available for Python (often because they were heavyweight and eschewed language-standard types in favour of custom classes), so I've spent the last year or so working on my own srt library. You can get it at https://github.com/cdown/srt.
I tried to keep it simple and light on classes (except for the core Subtitle class, which more or less just stores the SRT block data). It can read and write SRT files, and turn noncompliant SRT files into compliant ones.
Here's a usage example with your sample input:
>>> import srt, pprint
>>> gen = srt.parse('''\
... 1
... 00:00:12,815 --> 00:00:14,509
... Chlapi, jak to jde s
... těma pracovníma světlama?.
...
... 2
... 00:00:14,815 --> 00:00:16,498
... Trochu je zesilujeme.
...
... 3
... 00:00:16,934 --> 00:00:17,814
... Jo, sleduj.
...
... ''')
>>> pprint.pprint(list(gen))
[Subtitle(start=datetime.timedelta(0, 12, 815000), end=datetime.timedelta(0, 14, 509000), index=1, proprietary='', content='Chlapi, jak to jde s\ntěma pracovníma světlama?.'),
Subtitle(start=datetime.timedelta(0, 14, 815000), end=datetime.timedelta(0, 16, 498000), index=2, proprietary='', content='Trochu je zesilujeme.'),
Subtitle(start=datetime.timedelta(0, 16, 934000), end=datetime.timedelta(0, 17, 814000), index=3, proprietary='', content='Jo, sleduj.')]
The text is followed by an empty line, or the end of file. So you can use:
r' .... (?P<text>.*?)(\n\n|$)'
Here's some code I had lying around to parse SRT files:
from __future__ import division
import datetime
class Srt_entry(object):
def __init__(self, lines):
def parsetime(string):
hours, minutes, seconds = string.split(u':')
hours = int(hours)
minutes = int(minutes)
seconds = float(u'.'.join(seconds.split(u',')))
return datetime.timedelta(0, seconds, 0, 0, minutes, hours)
self.index = int(lines[0])
start, arrow, end = lines[1].split()
self.start = parsetime(start)
if arrow != u"-->":
raise ValueError
self.end = parsetime(end)
self.lines = lines[2:]
if not self.lines[-1]:
del self.lines[-1]
def __unicode__(self):
def delta_to_string(d):
hours = (d.days * 24) \
+ (d.seconds // (60 * 60))
minutes = (d.seconds // 60) % 60
seconds = d.seconds % 60 + d.microseconds / 1000000
return u','.join((u"%02d:%02d:%06.3f"
% (hours, minutes, seconds)).split(u'.'))
return (unicode(self.index) + u'\n'
+ delta_to_string(self.start)
+ ' --> '
+ delta_to_string(self.end) + u'\n'
+ u''.join(self.lines))
srt_file = open("foo.srt")
entries = []
entry = []
for line in srt_file:
if options.decode:
line = line.decode(options.decode)
if line == u'\n':
entries.append(Srt_entry(entry))
entry = []
else:
entry.append(line)
srt_file.close()
splits = [s.strip() for s in re.split(r'\n\s*\n', text) if s.strip()]
regex = re.compile(r'''(?P<index>\d+).*?(?P<start>\d{2}:\d{2}:\d{2},\d{3}) --> (?P<end>\d{2}:\d{2}:\d{2},\d{3})\s*.*?\s*(?P<text>.*)''', re.DOTALL)
for s in splits:
r = regex.search(s)
print r.groups()
Here's a snippet I wrote which converts SRT files into dictionaries:
import re
def srt_time_to_seconds(time):
split_time=time.split(',')
major, minor = (split_time[0].split(':'), split_time[1])
return int(major[0])*1440 + int(major[1])*60 + int(major[2]) + float(minor)/1000
def srt_to_dict(srtText):
subs=[]
for s in re.sub('\r\n', '\n', srtText).split('\n\n'):
st = s.split('\n')
if len(st)>=3:
split = st[1].split(' --> ')
subs.append({'start': srt_time_to_seconds(split[0].strip()),
'end': srt_time_to_seconds(split[1].strip()),
'text': '<br />'.join(j for j in st[2:len(st)])
})
return subs
Usage:
import srt_to_dict
with open('test.srt', "r") as f:
srtText = f.read()
print srt_to_dict(srtText)