Optimization of numpy array iteration - python

I'm trying to optimize the performance of my Python program and I think I have identified this piece of code as the bottleneck:
for i in range(len(green_list)):
    rgb_list = []
    for j in range(len(green_list[i])):
        rgb_list.append('%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j]))
    write_file(str(i), rgb_list)
Where red_list, green_list and blue_list are numpy arrays with values like this:
red_list = [[1, 2, 3, 4, 5], [51, 52, 53, 54, 55]]
green_list = [[6, 7, 8, 9, 10], [56, 57, 58, 59, 60]]
blue_list = [[11, 12, 13, 14, 15], [61, 62, 63, 64, 65]]
At the end of each run of the inner loop, rgb_list contains the hex values:
rgb_list = ['01060b', '02070c', '03080d', '04090e', '050a0f']
Now, it is not clear to me how to exploit the potential of numpy arrays but I think there is a way to optimize those two nested loops. Any suggestions?

I assume the essential traits of your code could be summarized in the following generator:
import numpy as np

def as_str_OP(r_arr, g_arr, b_arr):
    n, m = r_arr.shape
    for i in range(n):
        rgb = []
        for j in range(m):
            rgb.append('%02x%02x%02x' % (r_arr[i, j], g_arr[i, j], b_arr[i, j]))
        yield rgb
which can be consumed with a for loop, for example to write to disk:
for x in as_str_OP(r_arr, g_arr, b_arr):
    write_to_disk(x)
The generator itself can be written either with the core computation vectorized in NumPy or in a Numba-friendly way.
The key is to replace the relatively slow string interpolation with a custom int-to-hex computation.
This results in a substantial speed-up, especially as the size of the input grows (and particularly along the second dimension).
Below is the NumPy-vectorized version (note that here the digit-to-ASCII branch must itself be vectorized, e.g. with np.where):
def hex_to_ascii_np(x):
    # vectorized mapping: 0-9 -> ASCII '0'-'9' (offset 48), 10-15 -> 'a'-'f' (offset 87)
    return x + np.where(x < 10, 48, 87)

def as_str_np(r_arr, g_arr, b_arr):
    l = 3
    n, m = r_arr.shape
    for i in range(n):
        rgb = np.empty((m, 2 * l), dtype=np.uint32)
        r0, r1 = divmod(r_arr[i, :], 16)
        g0, g1 = divmod(g_arr[i, :], 16)
        b0, b1 = divmod(b_arr[i, :], 16)
        rgb[:, 0] = hex_to_ascii_np(r0)
        rgb[:, 1] = hex_to_ascii_np(r1)
        rgb[:, 2] = hex_to_ascii_np(g0)
        rgb[:, 3] = hex_to_ascii_np(g1)
        rgb[:, 4] = hex_to_ascii_np(b0)
        rgb[:, 5] = hex_to_ascii_np(b1)
        yield rgb.view(f'<U{2 * l}').reshape(m).tolist()
and the Numba-accelerated version:
import numba as nb

@nb.njit
def hex_to_ascii(x):
    ascii_num_offset = 48  # ord(b'0') == 48
    ascii_alp_offset = 87  # ord(b'a') == 97, (num of non-alpha digits) == 10
    return x + (ascii_num_offset if x < 10 else ascii_alp_offset)

@nb.njit
def _to_hex_2d(x):
    a, b = divmod(x, 16)
    return hex_to_ascii(a), hex_to_ascii(b)

@nb.njit
def _as_str_nb(r_arr, g_arr, b_arr):
    l = 3
    n, m = r_arr.shape
    for i in range(n):
        rgb = np.empty((m, 2 * l), dtype=np.uint32)
        for j in range(m):
            rgb[j, 0:2] = _to_hex_2d(r_arr[i, j])
            rgb[j, 2:4] = _to_hex_2d(g_arr[i, j])
            rgb[j, 4:6] = _to_hex_2d(b_arr[i, j])
        yield rgb

def as_str_nb(r_arr, g_arr, b_arr):
    l = 3
    n, m = r_arr.shape
    for x in _as_str_nb(r_arr, g_arr, b_arr):
        yield x.view(f'<U{2 * l}').reshape(m).tolist()
This essentially involves manually writing the numbers, correctly converted to hexadecimal ASCII chars, into a properly typed array, which can then be viewed as strings to give the desired output.
Note that the final numpy.ndarray.tolist() could be avoided if whatever consumes the generator is capable of dealing with the NumPy array itself, saving some potentially significant time, e.g.:
def as_str_nba(r_arr, g_arr, b_arr):
    l = 3
    n, m = r_arr.shape
    for x in _as_str_nb(r_arr, g_arr, b_arr):
        yield x.view(f'<U{2 * l}').reshape(m)
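As a quick illustration of the uint32-to-'<U6' view trick used throughout these versions (a minimal sketch; the ASCII codes below are spelled out by hand for the string '01060b'):
import numpy as np

# ASCII codes: '0' == 48, '1' == 49, '6' == 54, 'b' == 98
codes = np.array([[48, 49, 48, 54, 48, 98]], dtype=np.uint32)
print(codes.view('<U6').reshape(1).tolist())  # ['01060b']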
Overcoming the IO-bound bottleneck
However, if you are IO-bound, you should modify your code to write in blocks, e.g. using the grouper recipe from the itertools recipes:
from itertools import zip_longest

def grouper(iterable, n, *, incomplete='fill', fillvalue=None):
    "Collect data into non-overlapping fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, fillvalue='x') --> ABC DEF Gxx
    # grouper('ABCDEFG', 3, incomplete='strict') --> ABC DEF ValueError
    # grouper('ABCDEFG', 3, incomplete='ignore') --> ABC DEF
    args = [iter(iterable)] * n
    if incomplete == 'fill':
        return zip_longest(*args, fillvalue=fillvalue)
    if incomplete == 'strict':
        return zip(*args, strict=True)
    if incomplete == 'ignore':
        return zip(*args)
    else:
        raise ValueError('Expected fill, strict, or ignore')
to be used like:
group_size = 3
for x in grouper(as_str_OP(r_arr, g_arr, b_arr), group_size):
    write_many_to_disk(x)
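For completeness, here is a minimal sketch of what write_many_to_disk() could look like; the one-file-per-block naming scheme and the module-level counter are made up for illustration:
from itertools import count

_block_ids = count()

def write_many_to_disk(group):
    # hypothetical sketch: one text file per block, one row of hex strings per line;
    # grouper pads the last block with None fill values, so drop those first
    rows = [row for row in group if row is not None]
    with open('block_%04d.txt' % next(_block_ids), 'w') as f:
        for row in rows:
            f.write(' '.join(row) + '\n')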
Testing out the output
Some dummy input can be produced easily (r_arr is essentially red_list, etc.):
def gen_color(n, m):
    return np.random.randint(0, 2 ** 8, (n, m))
N, M = 10, 3
r_arr = gen_color(N, M)
g_arr = gen_color(N, M)
b_arr = gen_color(N, M)
and tested by consuming the generator to produce a list:
res_OP = list(as_str_OP(r_arr, g_arr, b_arr))
res_np = list(as_str_np(r_arr, g_arr, b_arr))
res_nb = list(as_str_nb(r_arr, g_arr, b_arr))
res_nba = list(as_str_nba(r_arr, g_arr, b_arr))
print(np.array(res_OP))
# [['1f6984' '916d98' 'f9d779']
# ['65f895' 'ded23e' '332fdc']
# ['b9e059' 'ce8676' 'cb75e9']
# ['bca0fc' '3289a9' 'cc3d3a']
# ['6bb0be' '07134a' 'c3cf05']
# ['152d5c' 'bac081' 'c59a08']
# ['97efcc' '4c31c0' '957693']
# ['15247e' 'af8f0a' 'ffb89a']
# ['161333' '8f41ce' '187b01']
# ['d811ae' '730b17' 'd2e269']]
print(res_OP == res_np)
# True
print(res_OP == res_nb)
# True
print(res_OP == [x.tolist() for x in res_nba])
# True
and possibly passing the values through some grouping:
k = 3
res_OP = list(grouper(as_str_OP(r_arr, g_arr, b_arr), k))
res_np = list(grouper(as_str_np(r_arr, g_arr, b_arr), k))
res_nb = list(grouper(as_str_nb(r_arr, g_arr, b_arr), k))
res_nba = list(grouper(as_str_nba(r_arr, g_arr, b_arr), k))
print(np.array(res_OP))
# [[list(['1f6984', '916d98', 'f9d779'])
# list(['65f895', 'ded23e', '332fdc'])
# list(['b9e059', 'ce8676', 'cb75e9'])]
# [list(['bca0fc', '3289a9', 'cc3d3a'])
# list(['6bb0be', '07134a', 'c3cf05'])
# list(['152d5c', 'bac081', 'c59a08'])]
# [list(['97efcc', '4c31c0', '957693'])
# list(['15247e', 'af8f0a', 'ffb89a'])
# list(['161333', '8f41ce', '187b01'])]
# [list(['d811ae', '730b17', 'd2e269']) None None]]
print(res_OP == res_np)
# True
print(res_OP == res_nb)
# True
print(res_OP == [tuple(y.tolist() if y is not None else y for y in x) for x in res_nba])
# True
Benchmarks
To give you some idea of the numbers we could be talking about, let us use %timeit on much larger inputs:
N, M = 1000, 1000
r_arr = gen_color(N, M)
g_arr = gen_color(N, M)
b_arr = gen_color(N, M)
%timeit -n 1 -r 1 list(as_str_OP(r_arr, g_arr, b_arr))
# 1 loop, best of 1: 1.1 s per loop
%timeit -n 4 -r 4 list(as_str_np(r_arr, g_arr, b_arr))
# 4 loops, best of 4: 279 ms per loop
%timeit -n 4 -r 4 list(as_str_nb(r_arr, g_arr, b_arr))
# 4 loops, best of 4: 96.5 ms per loop
%timeit -n 4 -r 4 list(as_str_nba(r_arr, g_arr, b_arr))
# 4 loops, best of 4: 10.4 ms per loop
To simulate disk writing we could use the following consumer:
import time
import math

def consumer(gen, timeout_sec=0.001, weight=1):
    result = []
    for x in gen:
        result.append(x)
        time.sleep(timeout_sec * weight)
    return result
where disk writing is simulated with a time.sleep() call whose timeout depends on the logarithm of the number of objects being written at once:
N, M = 1000, 1000
r_arr = gen_color(N, M)
g_arr = gen_color(N, M)
b_arr = gen_color(N, M)
%timeit -n 1 -r 1 consumer(as_str_OP(r_arr, g_arr, b_arr), weight=math.log2(2))
# 1 loop, best of 1: 2.37 s per loop
%timeit -n 1 -r 1 consumer(as_str_np(r_arr, g_arr, b_arr), weight=math.log2(2))
# 1 loop, best of 1: 1.48 s per loop
%timeit -n 1 -r 1 consumer(as_str_nb(r_arr, g_arr, b_arr), weight=math.log2(2))
# 1 loop, best of 1: 1.27 s per loop
%timeit -n 1 -r 1 consumer(as_str_nba(r_arr, g_arr, b_arr), weight=math.log2(2))
# 1 loop, best of 1: 1.13 s per loop
k = 100
%timeit -n 1 -r 1 consumer(grouper(as_str_OP(r_arr, g_arr, b_arr), k), weight=math.log2(1 + k))
# 1 loop, best of 1: 1.17 s per loop
%timeit -n 1 -r 1 consumer(grouper(as_str_np(r_arr, g_arr, b_arr), k), weight=math.log2(1 + k))
# 1 loop, best of 1: 368 ms per loop
%timeit -n 1 -r 1 consumer(grouper(as_str_nb(r_arr, g_arr, b_arr), k), weight=math.log2(1 + k))
# 1 loop, best of 1: 173 ms per loop
%timeit -n 1 -r 1 consumer(grouper(as_str_nba(r_arr, g_arr, b_arr), k), weight=math.log2(1 + k))
# 1 loop, best of 1: 87.4 ms per loop
Ignoring the disk-writing simulation, the NumPy-vectorized approach is ~4x faster at the test input sizes, while the Numba-accelerated approach gets ~10x to ~100x faster, depending on whether the potentially unnecessary conversion to list() with numpy.ndarray.tolist() is present or not.
When it comes to the simulated disk writing, the faster versions are all more or less equivalent, and noticeably less effective without grouping, resulting in a ~2x speed-up.
With grouping alone the speed-up is ~2x, but when it is combined with the faster approaches, the speed-ups range from ~3x for the NumPy-vectorized version to ~7x or ~13x for the Numba-accelerated approaches (with or without numpy.ndarray.tolist()).
Again, this is with the given input, and under the test conditions.
The actual mileage may vary.

You could use functools.reduce for the inner loop (note that, contrary to a common misconception, reduce runs serially and does not divide the computation between different threads behind the scenes):
from functools import reduce

for i in range(len(green_list)):
    rgb_list = reduce(
        lambda ls, j: ls + ['%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j])],
        range(len(green_list[i])),
        list())
    print(rgb_list)
or you could try to achieve the same goal with a one-liner:
for i in range(len(green_list)):
    rgb_list = ['%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j]) for j in range(len(green_list[i]))]
    print(rgb_list)
Hope it will do the trick for you.

In the code you show, the slow bit is the string formatting. That we can improve somewhat.
A hex colour consists of eight bits for the red field, eight for the green, and eight for the blue (since your data does not seem to have an alpha channel, I am going to ignore that option). So we need at least twenty four bits to store the rgb colours.
You can create hex values using numpy's bitwise operators. The advantage is that this is completely vectorised. You then only have one value to format into a hex string for each (i, j), instead of three:
for i in range(len(green_list)):
    hx = red_list[i] << 16 | green_list[i] << 8 | blue_list[i]
    hex_list = ['%06x' % val for val in hx]
When the numpy arrays have dimensions (10, 1_000_000), this is about 5.5x faster than your original method (on my machine).
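As a quick sanity check, running the packing on the sample rows from the question reproduces the expected hex strings:
import numpy as np

red = np.array([1, 2, 3, 4, 5])
green = np.array([6, 7, 8, 9, 10])
blue = np.array([11, 12, 13, 14, 15])
hx = red << 16 | green << 8 | blue
print(['%06x' % val for val in hx])
# ['01060b', '02070c', '03080d', '04090e', '050a0f']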

1. for-loop
Modifying how rgb_list.append() is performed does not affect the performance much.
import timeit

n = 1000000
red_list = [list(range(1, n+0)), list(range(1, n+2))]
green_list = [list(range(2, n+1)), list(range(2, n+3))]
blue_list = [list(range(3, n+2)), list(range(3, n+4))]

def test_1():
    for i in range(len(green_list)):
        rgb_list = ['%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j]) for j in range(len(green_list[i]))]

def test_2():
    for i in range(len(green_list)):
        rgb_list = [None] * len(green_list[i])
        for j in range(len(green_list[i])):
            rgb_list[j] = '%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j])

def test_3():
    for i in range(len(green_list)):
        rgb_list = []
        for j in range(len(green_list[i])):
            rgb_list.append('%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j]))
%timeit -n 1 -r 7 test_1(): 1.31 s ± 8.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit -n 1 -r 7 test_2(): 1.33 s ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit -n 1 -r 7 test_3(): 1.39 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2. disk IO
Modifying how the disk IO is performed also does not affect the performance much.
import pickle

n = 20000000

def test_write_each():
    for i in range(len(green_list)):
        rgb_list = ['%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j]) for j in range(len(green_list[i]))]
        with open("test_%d" % i, "wb") as f:
            pickle.dump(rgb_list, f)

def test_write_once():
    rgb_list_list = [None] * len(green_list)
    for i in range(len(green_list)):
        rgb_list_list[i] = ['%02x%02x%02x' % (red_list[i][j], green_list[i][j], blue_list[i][j]) for j in range(len(green_list[i]))]
    with open("test_all", "wb") as f:
        pickle.dump(rgb_list_list, f)
%timeit -n 1 -r 3 test_write_each(): 35.2 s ± 74.6 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
%timeit -n 1 -r 3 test_write_once(): 35.4 s ± 54.4 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
Conclusion
From the benchmark results, there seems to be no bottleneck to avoid in the code in the question.
If the disk IO itself is the problem, I would suggest running the disk-IO code only once, after every other job (including the ones that are not mentioned in this question) has finished.

Related

python bytes to bit string

I have a value of type bytes that needs to be converted to a BIT STRING
bytes_val = (b'\x80\x00', 14)
the bytes at index zero need to be converted to a bit string whose length is indicated by the second element (14 in this case) and formatted as groups of 8 bits, like below.
expected output => '10000000 000000'B
Another example
bytes_val2 = (b'\xff\xff\xff\xff\xf0\x00', 45) #=> '11111111 11111111 11111111 11111111 11110000 00000'B
What about some combination of formatting (below with f-string but can be done otherwise), and slicing:
def bytes2binstr(b, n=None):
    s = ' '.join(f'{x:08b}' for x in b)
    return s if n is None else s[:n + n // 8 + (0 if n % 8 else -1)]
If I understood correctly (I am not sure what the B at the end is supposed to mean), it passes your tests and a couple more:
func = bytes2binstr
args = (
    (b'\x80\x00', None),
    (b'\x80\x00', 14),
    (b'\x0f\x00', 14),
    (b'\xff\xff\xff\xff\xf0\x00', 16),
    (b'\xff\xff\xff\xff\xf0\x00', 22),
    (b'\x0f\xff\xff\xff\xf0\x00', 45),
    (b'\xff\xff\xff\xff\xf0\x00', 45),
)
for arg in args:
    print(arg)
    print(repr(func(*arg)))
# (b'\x80\x00', None)
# '10000000 00000000'
# (b'\x80\x00', 14)
# '10000000 000000'
# (b'\x0f\x00', 14)
# '00001111 000000'
# (b'\xff\xff\xff\xff\xf0\x00', 16)
# '11111111 11111111'
# (b'\xff\xff\xff\xff\xf0\x00', 22)
# '11111111 11111111 111111'
# (b'\x0f\xff\xff\xff\xf0\x00', 45)
# '00001111 11111111 11111111 11111111 11110000 00000'
# (b'\xff\xff\xff\xff\xf0\x00', 45)
# '11111111 11111111 11111111 11111111 11110000 00000'
Explanation
we start from a bytes object
iterating through it gives us each byte as a number
each byte is 8 bits, so formatting each byte separately already gives us the correct separation
each byte is formatted using the b binary specifier, with some additional formatting: 0 zero fill, 8 minimum length
we join (concatenate) the results of the formatting using ' ' as separator
finally, the result is returned as-is if a maximum number of bits n was not specified (set to None); otherwise the result is cropped to n plus the number of spaces that were added in between the 8-character groups.
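To make the cropping arithmetic concrete, a worked example for n = 14: the 14 bits span one full 8-character group plus 6 characters of the next, so exactly one separator space must survive the crop, giving 14 + 14 // 8 + 0 == 15 characters:
s = ' '.join(f'{x:08b}' for x in b'\x80\x00')
print(s)       # '10000000 00000000' (17 characters)
print(s[:15])  # '10000000 000000'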
In the solution above, 8 is somewhat hard-coded.
If you want it to be a parameter, you may want to look into (possibly a variation of) @kederrac's first answer using int.from_bytes().
This could look something like:
def bytes2binstr_frombytes(b, n=None, k=8):
    s = '{x:0{m}b}'.format(m=len(b) * 8, x=int.from_bytes(b, byteorder='big'))[:n]
    return ' '.join([s[i:i + k] for i in range(0, len(s), k)])
which gives the same output as above.
Speed-wise, the int.from_bytes()-based solution is also faster (with funcs collecting both implementations and random providing the input bytes):
import random

funcs = [bytes2binstr, bytes2binstr_frombytes]

for i in range(2, 7):
    n = 10 ** i
    print(n)
    b = b''.join([random.randint(0, 2 ** 8 - 1).to_bytes(1, 'big') for _ in range(n)])
    for func in funcs:
        print(func.__name__, funcs[0](b, n * 7) == func(b, n * 7))
        %timeit func(b, n * 7)
    print()
# 100
# bytes2binstr True
# 10000 loops, best of 3: 33.9 µs per loop
# bytes2binstr_frombytes True
# 100000 loops, best of 3: 15.1 µs per loop
# 1000
# bytes2binstr True
# 1000 loops, best of 3: 332 µs per loop
# bytes2binstr_frombytes True
# 10000 loops, best of 3: 134 µs per loop
# 10000
# bytes2binstr True
# 100 loops, best of 3: 3.29 ms per loop
# bytes2binstr_frombytes True
# 1000 loops, best of 3: 1.33 ms per loop
# 100000
# bytes2binstr True
# 10 loops, best of 3: 37.7 ms per loop
# bytes2binstr_frombytes True
# 100 loops, best of 3: 16.7 ms per loop
# 1000000
# bytes2binstr True
# 1 loop, best of 3: 400 ms per loop
# bytes2binstr_frombytes True
# 10 loops, best of 3: 190 ms per loop
you can use:
def bytest_to_bit(by, n):
    bi = "{:0{l}b}".format(int.from_bytes(by, byteorder='big'), l=len(by) * 8)[:n]
    return ' '.join([bi[i:i + 8] for i in range(0, len(bi), 8)])

bytest_to_bit(b'\xff\xff\xff\xff\xf0\x00', 45)
output:
'11111111 11111111 11111111 11111111 11110000 00000'
steps:
transform your bytes to an integer using int.from_bytes
str.format method can take a binary format spec.
also, you can use a more compact form where each byte is formatted:
def bytest_to_bit(by, n):
    bi = ' '.join(map('{:08b}'.format, by))
    return bi[:n + len(by) - 1].rstrip()

bytest_to_bit(b'\xff\xff\xff\xff\xf0\x00', 45)
test_data = [
    (b'\x80\x00', 14),
    (b'\xff\xff\xff\xff\xf0\x00', 45),
]

def get_bit_string(bytes_, length) -> str:
    output_chars = []
    for byte in bytes_:
        for _ in range(8):
            if length <= 0:
                return ''.join(output_chars)
            output_chars.append(str(byte >> 7 & 1))
            byte <<= 1
            length -= 1
        output_chars.append(' ')
    return ''.join(output_chars)

for data in test_data:
    print(get_bit_string(*data))
output:
10000000 000000
11111111 11111111 11111111 11111111 11110000 00000
explanation:
length: starts from the target length and decreases to 0.
if length <= 0: return ...: once we reach the target length, stop and return.
''.join(output_chars): makes a string from the list.
str(byte >> 7 & 1):
byte >> 7: shifts 7 bits to the right (only the MSB remains, since a byte has 8 bits).
MSB means Most Significant Bit.
(...) & 1: bit-wise AND operation. It extracts the LSB.
byte <<= 1: shifts byte 1 bit to the left.
length -= 1: decreases length.
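A minimal trace of this shifting logic for a single byte, using byte = 0x80 from the question's first example:
byte = 0x80
bits = []
for _ in range(8):
    bits.append(str(byte >> 7 & 1))  # take the current MSB
    byte <<= 1  # byte grows past 8 bits, but the & 1 above keeps the result correct
print(''.join(bits))  # '10000000'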
This is a lazy version.
It neither loads nor processes the entire bytes object.
This one halts regardless of input size; the other solutions may not!
I use collections.deque to build the bit string.
from collections import deque
from itertools import chain, repeat, starmap
import os

def bit_length_list(n):
    eights, rem = divmod(n, 8)
    return chain(repeat(8, eights), (rem,))

def build_bitstring(byte, bit_length):
    d = deque("0" * 8, 8)
    d.extend(bin(byte)[2:])
    return "".join(d)[:bit_length]

def bytes_to_bits(byte_string, bits):
    return "{!r}B".format(
        " ".join(starmap(build_bitstring, zip(byte_string, bit_length_list(bits))))
    )
Test:
In [1]: bytes_ = os.urandom(int(1e9))
In [2]: timeit bytes_to_bits(bytes_, 0)
4.21 µs ± 27.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [3]: timeit bytes_to_bits(os.urandom(1), int(1e9))
6.8 µs ± 51 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [4]: bytes_ = os.urandom(6)
In [5]: bytes_
Out[5]: b'\xbf\xd5\x08\xbe$\x01'
In [6]: timeit bytes_to_bits(bytes_, 45) #'10111111 11010101 00001000 10111110 00100100 00000'B
12.3 µs ± 85 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [7]: bytes_to_bits(bytes_, 14)
Out[7]: "'10111111 110101'B"
When you say BIT, do you mean binary?
I would try:
bytes_val = b'\x80\x00'
for byte in bytes_val:
    value_in_binary = bin(byte)
This gives the answer without Python's binary representation prefix 0b:
bit_str = ' '.join(bin(i).replace('0b', '') for i in bytes_val)
This works in Python 3.x:
def to_bin(l):
    val, length = l
    bit_str = ''.join(bin(i).replace('0b', '') for i in val)
    if len(bit_str) < length:
        # pad with zeros
        return '0'*(length-len(bit_str)) + bit_str
    else:
        # cut to size
        return bit_str[:length]

bytes_val = [b'\x80\x00', 14]
print(to_bin(bytes_val))
and this works in 2.x:
def to_bin(l):
    val, length = l
    bit_str = ''.join(bin(ord(i)).replace('0b', '') for i in val)
    if len(bit_str) < length:
        # pad with zeros
        return '0'*(length-len(bit_str)) + bit_str
    else:
        # cut to size
        return bit_str[:length]

bytes_val = [b'\x80\x00', 14]
print(to_bin(bytes_val))
Both produce the result 00000100000000

convert time string XhYmZs to seconds in python

I have a string which comes in three forms:
XhYmZs or YmZs or Zs
where h, m, s stand for hours, minutes, seconds and X, Y, Z are the corresponding values.
How do I efficiently convert these strings to seconds in Python 2.7?
I guess I can do something like:
s="XhYmZs"
if "h" in s:
hours=s.split("h")
elif "m" in s:
mins=s.split("m")[0][-1]
... but this does not seem very efficient to me :(
Split on the delimiters you're interested in, then parse each resulting element into an integer and multiply as needed:
import re

def hms(s):
    l = list(map(int, re.split('[hms]', s)[:-1]))
    if len(l) == 3:
        return l[0]*3600 + l[1]*60 + l[2]
    elif len(l) == 2:
        return l[0]*60 + l[1]
    else:
        return l[0]
This produces a duration normalized to seconds.
>>> hms('3h4m5s')
11045
>>> 3*3600+4*60+5
11045
>>> hms('70m5s')
4205
>>> 70*60+5
4205
>>> hms('300s')
300
You can also make this one line by turning the re.split() result around and multiplying by 60 raised to an incrementing power based on the element's position in the list:
def hms2(s):
    return sum(int(x)*60**i for i, x in enumerate(re.split('[hms]', s)[-2::-1]))
>>> import datetime
>>> datetime.datetime.strptime('3h4m5s', '%Hh%Mm%Ss').time()
datetime.time(3, 4, 5)
Since it varies which fields are in your strings, you may have to build a matching format string.
>>> def parse(s):
...     fmt = ''.join('%' + c.upper() + c for c in 'hms' if c in s)
...     return datetime.datetime.strptime(s, fmt).time()
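A quick check of the builder (note that strptime's %M only accepts minutes below 60, so an input like '70m5s' would need the arithmetic approaches above):
>>> parse('3h4m5s')
datetime.time(3, 4, 5)
>>> parse('4m5s')
datetime.time(0, 4, 5)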
The datetime module is the standard library way to handle times.
Asking to do this "efficiently" is a bit of a fool's errand. String parsing in an interpreted language isn't fast; aim for clarity. In addition, seeming efficient isn't very meaningful; either analyze the algorithm or benchmark, otherwise it's speculation.
I do not know how efficient this is, but this is how I would do it:
import re

test_data = [
    '1h2m3s',
    '1m2s',
    '1s',
    '3s1h2m',
]

HMS_REGEX = re.compile(r'^(\d+)h(\d+)m(\d+)s$')
MS_REGEX = re.compile(r'^(\d+)m(\d+)s$')
S_REGEX = re.compile(r'^(\d+)s$')

def total_seconds(hms_string):
    found = HMS_REGEX.match(hms_string)
    if found:
        return 3600 * int(found.group(1)) + \
               60 * int(found.group(2)) + \
               int(found.group(3))
    found = MS_REGEX.match(hms_string)
    if found:
        return 60 * int(found.group(1)) + int(found.group(2))
    found = S_REGEX.match(hms_string)
    if found:
        return int(found.group(1))
    raise ValueError('Could not convert ' + hms_string)

for datum in test_data:
    try:
        print(total_seconds(datum))
    except ValueError as exc:
        print(exc)
or going to a single match and riffing on TigerhawkT3's one-liner, but retaining the error checking for non-matching strings:
HMS_REGEX = re.compile(r'^(\d+)h(\d+)m(\d+)s$|^(\d+)m(\d+)s$|^(\d+)s$')

def total_seconds(hms_string):
    found = HMS_REGEX.match(hms_string)
    if found:
        return sum(
            int(x or 0) * 60 ** i for i, x in enumerate(
                y for y in reversed(found.groups()) if y is not None))
    raise ValueError('Could not convert ' + hms_string)
My fellow Pythonistas, please stop using regular expressions for everything. A regular expression is not needed for such simple tasks. Python is considered a slow language not because of the GIL or the interpreter, but because of this kind of misuse.
In [1]: import re
...: def hms(s):
...: l = list(map(int, re.split('[hms]', s)[:-1]))
...: if len(l) == 3:
...: return l[0]*3600 + l[1]*60 + l[2]
...: elif len(l) == 2:
...: return l[0]*60 + l[1]
...: else:
...: return l[0]
In [2]: %timeit hms("6h7m8s")
5.62 µs ± 722 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [6]: def ehms(s):
...: bases=dict(h=3600, m=60, s=1)
...: secs = 0
...: num = 0
...: for c in s:
...: if c.isdigit():
...: num = num * 10 + int(c)
...: else:
...: secs += bases[c] * num
...: num = 0
...: return secs
In [7]: %timeit ehms("6h7m8s")
2.07 µs ± 70.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [8]: %timeit hms("8s")
2.35 µs ± 124 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [9]: %timeit ehms("8s")
1.06 µs ± 118 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
In [10]: bases=dict(h=3600, m=60, s=1)
In [15]: a = ord('0')
In [16]: def eehms(s):
...: secs = 0
...: num = 0
...: for c in s:
...: if c.isdigit():
...: num = num * 10 + ord(c) - a
...: else:
...: secs += bases[c] * num
...: num = 0
...: return secs
In [17]: %timeit eehms("6h7m8s")
1.45 µs ± 30 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
see, almost 4 times as fast.
There's a library, python-dateutil (pip install python-dateutil), which takes a string and returns a datetime.datetime.
It can parse values like 5h 30m, 0.5h 30m, 0.5h, with spaces or without.
from datetime import datetime
from dateutil import parser
time = '5h15m50s'
midnight_plus_time = parser.parse(time)
midnight: datetime = datetime.combine(datetime.today(), datetime.min.time())
timedelta = midnight_plus_time - midnight
print(timedelta.seconds) # 18950
It can't parse more than 24h at once, though.
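If durations beyond 24 hours matter, one hedged workaround is to skip the datetime parsing and build a datetime.timedelta by hand; the regex and helper below are my own illustration, not part of dateutil:
import re
from datetime import timedelta

def to_timedelta(s):
    # optional h, m, s components; the trailing $ keeps partial matches out
    m = re.match(r'(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?$', s)
    h, mins, secs = (int(g or 0) for g in m.groups())
    return timedelta(hours=h, minutes=mins, seconds=secs)

print(to_timedelta('30h5s').total_seconds())  # 108005.0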

How to turn a 1D radial profile into a 2D array in python

I have a list that models a phenomenon that is a function of radius. I want to convert this to a 2D array. I wrote some code that does exactly what I want, but since it uses nested for loops, it is quite slow.
l = len(profile1D)/2
critDim = int((l**2 /2.)**(1/2.))
profile2D = np.empty([critDim, critDim])
for x in xrange(0, critDim):
    for y in xrange(0, critDim):
        r = ((x**2 + y**2)**(1/2.))
        profile2D[x,y] = profile1D[int(l+r)]
Is there a more efficient way to do the same thing by avoiding these loops?
Here's a vectorized approach using broadcasting -
a = np.arange(critDim)**2
r2D = np.sqrt(a[:,None] + a)
out = profile1D[(l+r2D).astype(int)]
If there are many repeated indices generated by l+r2D, we can use np.take for some further performance boost, like so -
out = np.take(profile1D,(l+r2D).astype(int))
Runtime test
Function definitions -
def org_app(profile1D, l, critDim):
    profile2D = np.empty([critDim, critDim])
    for x in xrange(0, critDim):
        for y in xrange(0, critDim):
            r = ((x**2 + y**2)**(1/2.))
            profile2D[x,y] = profile1D[int(l+r)]
    return profile2D

def vect_app1(profile1D, l, critDim):
    a = np.arange(critDim)**2
    r2D = np.sqrt(a[:,None] + a)
    out = profile1D[(l+r2D).astype(int)]
    return out

def vect_app2(profile1D, l, critDim):
    a = np.arange(critDim)**2
    r2D = np.sqrt(a[:,None] + a)
    out = np.take(profile1D, (l+r2D).astype(int))
    return out
Timings and verification -
In [25]: # Setup input array and params
...: profile1D = np.random.randint(0,9,(1000))
...: l = len(profile1D)/2
...: critDim = int((l**2 /2.)**(1/2.))
...:
In [26]: np.allclose(org_app(profile1D,l,critDim),vect_app1(profile1D,l,critDim))
Out[26]: True
In [27]: np.allclose(org_app(profile1D,l,critDim),vect_app2(profile1D,l,critDim))
Out[27]: True
In [28]: %timeit org_app(profile1D,l,critDim)
10 loops, best of 3: 154 ms per loop
In [29]: %timeit vect_app1(profile1D,l,critDim)
1000 loops, best of 3: 1.69 ms per loop
In [30]: %timeit vect_app2(profile1D,l,critDim)
1000 loops, best of 3: 1.68 ms per loop
In [31]: # Setup input array and params
...: profile1D = np.random.randint(0,9,(5000))
...: l = len(profile1D)/2
...: critDim = int((l**2 /2.)**(1/2.))
...:
In [32]: %timeit org_app(profile1D,l,critDim)
1 loops, best of 3: 3.76 s per loop
In [33]: %timeit vect_app1(profile1D,l,critDim)
10 loops, best of 3: 59.8 ms per loop
In [34]: %timeit vect_app2(profile1D,l,critDim)
10 loops, best of 3: 59.5 ms per loop

Faster loop operating on two values of an array

Consider the following function:
def dostuff(n, f):
    array = numpy.arange(0, n)
    for i in range(1, n):                   # Line 1
        array[i] = f(array[i-1], array[i])  # Line 2
    return numpy.sum(array)
How can I rewrite the Line 1/Line 2 to make the loop faster in python 3 (without using cython)?
I encourage you to check this question on SO: generalized cumulative functions in NumPy/SciPy?, since you want a generalized cumulative function.
Also check the NumPy documentation for the function numpy.frompyfunc.
func = np.frompyfunc(f, 2, 1)

def dostuff(n, f):
    final_array = func.accumulate(np.arange(0, n), dtype=object).astype(int)
    return np.sum(final_array)
Example
In [86]:
def f(num1, num2):
    return num1 + num2
In [87]:
func = np.frompyfunc(f, 2, 1)
In [88]:
def dostuff(n, f):
    final_array = func.accumulate(np.arange(0, n), dtype=object).astype(int)
    return np.sum(final_array)
In [108]:
dostuff(15, f)
Out[108]:
560
In [109]:
dostuff(10, f)
Out[109]:
165
Benchmarks
def dostuff1(n, f):
    array = np.arange(0, n)
    for i in range(1, n):                   # Line 1
        array[i] = f(array[i-1], array[i])  # Line 2
    return np.sum(array)

def dostuff2(n, f):
    final_array = func.accumulate(np.arange(0, n), dtype=object).astype(int)
    return np.sum(final_array)
In [126]:
%timeit dostuff1(100,f)
10000 loops, best of 3: 40.6 µs per loop
In [127]:
%timeit dostuff2(100,f)
The slowest run took 4.98 times longer than the fastest. This could mean that an intermediate result is being cached
10000 loops, best of 3: 23.8 µs per loop
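As an aside: for the specific f used in this example (plain addition), np.ufunc.accumulate over a Python function is still far from optimal, and a fully vectorized equivalent exists; the sketch below only works for addition and is meant as a comparison point, not a general solution:
import numpy as np

def dostuff_cumsum(n):
    # equivalent to dostuff(n, f) above when f is addition
    return np.sum(np.cumsum(np.arange(0, n)))

print(dostuff_cumsum(15))  # 560
print(dostuff_cumsum(10))  # 165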

Iterate over a ‘window’ of adjacent elements in Python

This is more a question of elegance and performance rather than "how to do it at all", so I'll just show the code:
def iterate_adjacencies(gen, fill=0, size=2, do_fill_left=True,
                        do_fill_right=False):
    """ Iterates over a 'window' of `size` adjacent elements in the supplied
    `gen` generator, using `fill` to fill the left edge if `do_fill_left` is
    True (default), and to fill the right edge (i.e. the last element and
    `size-1` `fill` elements as the last item) if `do_fill_right` is True. """
    fill_size = size - 1
    prev = [fill] * fill_size
    i = 1
    for item in gen:  # iterate over the supplied `whatever`.
        if not do_fill_left and i < size:
            i += 1
        else:
            yield prev + [item]
        prev = prev[1:] + [item]
    if do_fill_right:
        for i in range(fill_size):
            yield prev + [fill]
            prev = prev[1:] + [fill]
and then ask: is there already a function for that? And, if not, can you do the same thing in a better (i.e. neater and/or faster) way?
Edit:
with ideas from the answers of @agf, @FogleBird, and @senderle, a resulting somewhat-neat-looking piece of code is:
from itertools import chain, repeat, islice

def window(seq, size=2, fill=0, fill_left=True, fill_right=False):
    """ Returns a sliding window (of width n) over data from the iterable:
    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...
    """
    ssize = size - 1
    it = chain(
        repeat(fill, ssize * fill_left),
        iter(seq),
        repeat(fill, ssize * fill_right))
    result = tuple(islice(it, size))
    if len(result) == size:  # `<=` if okay to return seq if len(seq) < size
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result
This page shows how to implement a sliding window with itertools. http://docs.python.org/release/2.3.5/lib/itertools-example.html
def window(seq, n=2):
    "Returns a sliding window (of width n) over data from the iterable"
    " s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... "
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result
Example output:
>>> list(window(range(10)))
[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9)]
You'd need to change it to fill left and right if you need.
This is my version that fills, keeping the signature the same. I have previously seen the itertools recipe, but did not look at it before writing this.
from itertools import chain
from collections import deque

def ia(gen, fill=0, size=2, fill_left=True, fill_right=False):
    gen, ssize = iter(gen), size - 1
    deq = deque(chain([fill] * ssize * fill_left,
                      (next(gen) for _ in xrange((not fill_left) * ssize))),
                maxlen=size)
    for item in chain(gen, [fill] * ssize * fill_right):
        deq.append(item)
        yield deq
Edit: I also didn't see your comments on your question before posting this.
Edit 2: Fixed. I had tried to do it with one chain but this design needs two.
Edit 3: As @senderle noted, only use this as a generator; don't wrap it with list or accumulate the output, as it yields the same mutable item repeatedly.
Ok, after coming to my senses, here's a non-ridiculous version of window_iter_fill. My previous version (visible in edits) was terrible because I forgot to use izip. Not sure what I was thinking. Using izip, this works, and, in fact, is the fastest option for small inputs!
from itertools import chain, izip, repeat, tee

def window_iter_fill(gen, size=2, fill=None):
    gens = (chain(repeat(fill, size - i - 1), gen, repeat(fill, i))
            for i, gen in enumerate(tee(gen, size)))
    return izip(*gens)
This one is also fine for tuple-yielding, but not quite as fast.
def window_iter_deque(it, size=2, fill=None, fill_left=False, fill_right=False):
    lfill = repeat(fill, size - 1 if fill_left else 0)
    rfill = repeat(fill, size - 1 if fill_right else 0)
    it = chain(lfill, it, rfill)
    d = deque(islice(it, 0, size - 1), maxlen=size)
    for item in it:
        d.append(item)
        yield tuple(d)
HoverHell's newest solution is still the best tuple-yielding solution for large inputs.
Some timings:
Arguments: [xrange(1000), 5, 'x', True, True]
==============================================================================
window HoverHell's frankeniter : 0.2670ms [1.91x]
window_itertools from old itertools docs : 0.2811ms [2.02x]
window_iter_fill extended `pairwise` with izip : 0.1394ms [1.00x]
window_iter_deque deque-based, copying : 0.4910ms [3.52x]
ia_with_copy deque-based, copying v2 : 0.4892ms [3.51x]
ia deque-based, no copy : 0.2224ms [1.60x]
==============================================================================
Scaling behavior:
Arguments: [xrange(10000), 50, 'x', True, True]
==============================================================================
window HoverHell's frankeniter : 9.4897ms [4.61x]
window_itertools from old itertools docs : 9.4406ms [4.59x]
window_iter_fill extended `pairwise` with izip : 11.5223ms [5.60x]
window_iter_deque deque-based, copying : 12.7657ms [6.21x]
ia_with_copy deque-based, copying v2 : 13.0213ms [6.33x]
ia deque-based, no copy : 2.0566ms [1.00x]
==============================================================================
The deque-yielding solution by agf is super fast for large inputs -- seemingly O(n) instead of O(nm) like the others, where n is the length of the iterable and m is the size of the window -- because it doesn't have to iterate over every window. But I still think it makes more sense to yield a tuple in the general case, because the calling function is probably just going to iterate over the deque anyway; it's just a shift of the computational burden. The asymptotic behavior of the larger program should remain the same.
Still, in some special cases, the deque-yielding version will probably be faster.
Some more timings based on HoverHell's test structure.
>>> import testmodule
>>> kwa = dict(gen=xrange(1000), size=4, fill=-1, fill_left=True, fill_right=True)
>>> %timeit -n 1000 [a + b + c + d for a, b, c, d in testmodule.window(**kwa)]
1000 loops, best of 3: 462 us per loop
>>> %timeit -n 1000 [a + b + c + d for a, b, c, d in testmodule.ia(**kwa)]
1000 loops, best of 3: 463 us per loop
>>> %timeit -n 1000 [a + b + c + d for a, b, c, d in testmodule.window_iter_fill(**kwa)]
1000 loops, best of 3: 251 us per loop
>>> %timeit -n 1000 [sum(x) for x in testmodule.window(**kwa)]
1000 loops, best of 3: 525 us per loop
>>> %timeit -n 1000 [sum(x) for x in testmodule.ia(**kwa)]
1000 loops, best of 3: 462 us per loop
>>> %timeit -n 1000 [sum(x) for x in testmodule.window_iter_fill(**kwa)]
1000 loops, best of 3: 333 us per loop
Overall, once you use izip, window_iter_fill is quite fast, as it turns out -- especially for small windows.
Resulting function (from the edit of the question):
frankeniter, with ideas from the answers of @agf, @FogleBird, and @senderle; a resulting somewhat-neat-looking piece of code is:
from itertools import chain, repeat, islice
def window(seq, size=2, fill=0, fill_left=True, fill_right=False):
    """ Returns a sliding window (of width n) over data from the iterable:
    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...
    """
    ssize = size - 1
    it = chain(
        repeat(fill, ssize * fill_left),
        iter(seq),
        repeat(fill, ssize * fill_right))
    result = tuple(islice(it, size))
    if len(result) == size:  # `<=` if okay to return seq if len(seq) < size
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result
and, for some performance information regarding deque/tuple:
In [32]: kwa = dict(gen=xrange(1000), size=4, fill=-1, fill_left=True, fill_right=True)
In [33]: %timeit -n 10000 [a+b+c+d for a,b,c,d in tmpf5.ia(**kwa)]
10000 loops, best of 3: 358 us per loop
In [34]: %timeit -n 10000 [a+b+c+d for a,b,c,d in tmpf5.window(**kwa)]
10000 loops, best of 3: 368 us per loop
In [36]: %timeit -n 10000 [sum(x) for x in tmpf5.ia(**kwa)]
10000 loops, best of 3: 340 us per loop
In [37]: %timeit -n 10000 [sum(x) for x in tmpf5.window(**kwa)]
10000 loops, best of 3: 432 us per loop
but anyway, if it's numbers then numpy is likely preferable.
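For reference, a sketch of the NumPy route hinted at above, using numpy.lib.stride_tricks.sliding_window_view (added in NumPy 1.20, so long after this discussion); the padding mimics size=4 with fill=-1 on both edges:
import numpy as np

data = np.arange(10)
size = 4
padded = np.pad(data, (size - 1, size - 1), constant_values=-1)
windows = np.lib.stride_tricks.sliding_window_view(padded, size)
print(windows[:3])
# [[-1 -1 -1  0]
#  [-1 -1  0  1]
#  [-1  0  1  2]]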
I'm surprised nobody took a simple coroutine approach.
from collections import deque

def window(n, initial_data=None):
    if initial_data:
        win = deque(initial_data, n)
    else:
        win = deque(((yield) for _ in range(n)), n)
    while 1:
        side, val = (yield win)
        if side == 'left':
            win.appendleft(val)
        else:
            win.append(val)

win = window(4)
win.next()
print(win.send(('left', 1)))
print(win.send(('left', 2)))
print(win.send(('left', 3)))
print(win.send(('left', 4)))
print(win.send(('right', 5)))
## -- Results of print statements --
deque([1, None, None, None], maxlen=4)
deque([2, 1, None, None], maxlen=4)
deque([3, 2, 1, None], maxlen=4)
deque([4, 3, 2, 1], maxlen=4)
deque([3, 2, 1, 5], maxlen=4)
