numba #njit to update a big dict - python

I try to use numba for a function that need to do a search on a very big (10e6) dict with (int, int) tuple as key.
import numpy as np
from numba import njit
myarray = np.array([[0, 0], # 0, 1
[0, 1],
[1, 1], # 1, 2
[1, 2], # 1, 3
[2, 2],
[1, 3]]
) # a lot of this with shape~(10e6, 2)
dict_with_tuples_key = {(0, 1): 1,
(3, 7): 1} # ~10e6 keys
A simplified version look like this
# #njit
def update_dict(dict_with_tuples_key, myarray):
for line in myarray:
i, j = line
if (i, j) in dict_with_tuples_key:
dict_with_tuples_key[(i, j)] += 1
else:
dict_with_tuples_key[(i, j)] = 1
return dict_with_tuples_key
new_dict = update_dict(dict_with_tuples_key, myarray)
print new_dict
new_dict = update_dict2(dict_with_tuples_key, myarray)
# print new_dict
# {(0, 1): 2, # +1 already in dict_with_tuples_key
# (0, 0): 1, # diag
# (1, 1): 1, # diag
# (2, 2): 1, # diag
# (1, 2): 1, # new from myarray
# (1, 3): 1, # new from myarray
# (3, 7): 1 }
It would appear that #njit does not accept dict as function arg ?
I'm wondering how to rewrite this, specially the if (i, j) in dict_with_tuples_key part that do the search.

njit means that the function is compiled in nopython mode. A dict, list and tuple are python objects and therefore not supported. Not as arguments and not inside the function.
If your dict keys are all different I would consider using a 2D numpy array where the first axis represents the first index of the dict-key-tuple and the second axis the second index. Then you could rewrite it as:
from numba import njit
import numpy as np
#njit
def update_array(array, myarray):
elements = myarray.shape[0]
for i in range(elements):
array[myarray[i][0]][myarray[i][1]] += 1
return array
myarray = np.array([[0, 0], [0, 1], [1, 1],
[1, 2], [2, 2], [1, 3]])
# Calculate the size of the numpy array that replaces the dict:
lens = np.max(myarray, axis=0) # Maximum values
array = np.zeros((lens[0]+1, lens[1]+1)) # Create an empty array to hold all indexes in myarray
update_array(array, myarray)
Since you already indexed your dictionary with tuples the transition problems to indexing an array will not be great.

As an alternative you can try if this fast enough:
from collections import Counter
c2 = Counter(dict_with_tuples_key)
c1 = Counter(tuple(x) for x in myarray)
new_dict = dict(c1 + c2)

Related

Finding indices of first non-zero items in a list

I have the following list :
list_test = [0,0,0,1,0,2,5,4,0,0,5,5,3,0,0]
I would like to find the indices of all the first numbers in the list that are not equal to zero.
In this case the output should be:
output = [3,5,10]
Is there a Pythonic way to do this?
According to the output, I think you want the first index of continuous non-zero sequences.
As for Pythonic, I understand it as list generator, while it's poorly readable.
# works with starting with non-zero element.
# list_test = [1, 0, 0, 1, 0, 2, 5, 4, 0, 0, 5, 5, 3, 0, 0]
list_test = [0, 0, 0, 1, 0, 2, 5, 4, 0, 0, 5, 5, 3, 0, 0]
output = [i for i in range(len(list_test)) if list_test[i] != 0 and (i == 0 or list_test[i - 1] == 0)]
print(output)
There is also a numpy based solution:
import numpy as np
l = np.array([0,0,0,1,0,2,5,4,0,0,5,5,3,0,0])
non_zeros = np.where(l != 0)[0]
diff = np.diff(non_zeros)
np.append(non_zeros [0], non_zeros [1 + np.where(diff>=2)[0]]) # array([ 3, 5, 10], dtype=int64)
Explanation:
First, we find the non-zero places, then we calculate the pair differences of those position (we need to add 1 because its out[i] = a[i+1] - a[i], read more about np.diff) then we need to add the first element of non-zero and also all the values where the difference was greater then 1)
Note:
It will also work for the case where the array start with non-zero element or all non-zeros.
From the Link.
l = [0,0,0,1,0,2,5,4,0,0,5,5,3,0,0]
v = {}
for i, x in enumerate(l):
if x != 0 and x not in v:
v[x] = i
list_test = [0,0,0,1,0,2,5,4,0,0,5,5,3,0,0]
res = {}
for index, item in enumerate(list_test):
if item > 0:
res.setdefault(index, None)
print(res.keys())
I don't knwo what you mean by Pythonic way, but this is an answer using a simple loop:
list_test = [0,0,0,1,0,2,5,4,0,0,5,5,3,0,0]
out = []
if list_test[0] == 0:
out.append(0)
for i in range(1, len(list_test)):
if (list_test[i-1] == 0) and (list_test[i] != 0):
out.append(i)
Don't hesitate to precise what you mean by "Pythonic" !

How to merge an array with its array elements in Python?

I have an array like below;
constants = ['(1,2)', '(1,5,1)', '1']
I would like to transform the array into like below;
constants = [(1,2), 1, 2, 3, 4, 5, 1]
For doing this, i tried some operations;
from ast import literal_eval
import numpy as np
constants = literal_eval(str(constants).replace("'",""))
constants = [(np.arange(*i) if len(i)==3 else i) if isinstance(i, tuple) else i for i in constants]
And the output was;
constants = [(1, 2), array([1, 2, 3, 4]), 1]
So, this is not expected result and I'm stuck in this step. The question is, how can i merge the array with its parent array?
This is one approach.
Demo:
from ast import literal_eval
constants = ['(1,2)', '(1,5,1)', '1']
res = []
for i in constants:
val = literal_eval(i) #Convert to python object
if isinstance(val, tuple): #Check if element is tuple
if len(val) == 3: #Check if no of elements in tuple == 3
val = list(val)
val[1]+=1
res.extend(range(*val))
continue
res.append(val)
print(res)
Output:
[(1, 2), 1, 2, 3, 4, 5, 1]
I'm going to assume that this question is very literal, and that you always want to transform this:
constants = ['(a, b)', '(x, y, z)', 'i']
into this:
transformed = [(a,b), x, x+z, x+2*z, ..., y, i]
such that the second tuple is a range from x to y with step z. So your final transformed array is the first element, then the range defined by your second element, and then your last element. The easiest way to do this is simply step-by-step:
constants = ['(a, b)', '(x, y, z)', 'i']
literals = [eval(k) for k in constants] # get rid of the strings
part1 = [literals[0]] # individually make each of the three parts of your list
part2 = [k for k in range(literals[1][0], literals[1][1] + 1, literals[1][2])] # or if you don't need to include y then you could just do range(literals[1])
part3 = [literals[2]]
transformed = part1 + part2 + part3
I propose the following:
res = []
for cst in constants:
if isinstance(cst,tuple) and (len(cst) == 3):
#add the range to the list
res.extend(range(cst[0],cst[1], cst[2]))
else:
res.append(cst)
res has the result you want.
There may be a more elegant way to solve it.
Please use code below to resolve parsing described above:
from ast import literal_eval
constants = ['(1,2)', '(1,5,1)', '1']
processed = []
for index, c in enumerate(constants):
parsed = literal_eval(c)
if isinstance(parsed, (tuple, list)) and index != 0:
processed.extend(range(1, max(parsed) + 1))
else:
processed.append(parsed)
print processed # [(1, 2), 1, 2, 3, 4, 5, 1]

Finding longest run in a list

Given a list of data, I'm trying to create a new list in which the value at position i is the length of the longest run starting from position i in the original list. For instance, given
x_list = [1, 1, 2, 3, 3, 3]
Should return:
run_list = [2, 1, 1, 3, 2, 1]
My solution:
freq_list = []
current = x_list[0]
count = 0
for num in x_list:
if num == current:
count += 1
else:
freq_list.append((current,count))
current = num
count = 1
freq_list.append((current,count))
run_list = []
for i in freq_list:
z = i[1]
while z > 0:
run_list.append(z)
z -= 1
Firstly I create a list freq_list of tuples, where every tuple's first element is the element from x_list, and where the second element is the number of the total run.
In this case:
freq_list = [(1, 2), (2, 1), (3, 3)]
Having this, I create a new list and append appropriate values.
However, I was wondering if there is a shorter way/another way to do this?
Here's a simple solution that iterates over the list backwards and increments a counter each time a number is repeated:
last_num = None
result = []
for num in reversed(x_list):
if num != last_num:
# if the number changed, reset the counter to 1
counter = 1
last_num = num
else:
# if the number is the same, increment the counter
counter += 1
result.append(counter)
# reverse the result
result = list(reversed(result))
Result:
[2, 1, 1, 3, 2, 1]
This is possible using itertools:
from itertools import groupby, chain
x_list = [1, 1, 2, 3, 3, 3]
gen = (range(len(list(j)), 0, -1) for _, j in groupby(x_list))
res = list(chain.from_iterable(gen))
Result
[2, 1, 1, 3, 2, 1]
Explanation
First use itertools.groupby to group identical items in your list.
For each item in your groupby, create a range object which counts backwards from the length of the number of consecutive items to 1.
Turn this all into a generator to avoid building a list of lists.
Use itertools.chain to chain the ranges from the generator.
Performance note
Performance will be inferior to #Aran-Fey's solution. Although itertools.groupby is O(n), it makes heavy use of expensive __next__ calls. These do not scale as well as iteration in simple for loops. See itertools docs for groupby pseudo-code.
If performance is your main concern, stick with the for loop.
You are performing a reverse cumulative count on contiguous groups. We can create a Numpy cumulative count function with
import numpy as np
def cumcount(a):
a = np.asarray(a)
b = np.append(False, a[:-1] != a[1:])
c = b.cumsum()
r = np.arange(len(a))
return r - np.append(0, np.flatnonzero(b))[c] + 1
and then generate our result with
a = np.array(x_list)
cumcount(a[::-1])[::-1]
array([2, 1, 1, 3, 2, 1])
I would use a generator for this kind of task because it avoids building the resulting list incrementally and can be used lazily if one wanted:
def gen(iterable): # you have to think about a better name :-)
iterable = iter(iterable)
# Get the first element, in case that fails
# we can stop right now.
try:
last_seen = next(iterable)
except StopIteration:
return
count = 1
# Go through the remaining items
for item in iterable:
if item == last_seen:
count += 1
else:
# The consecutive run finished, return the
# desired values for the run and then reset
# counter and the new item for the next run.
yield from range(count, 0, -1)
count = 1
last_seen = item
# Return the result for the last run
yield from range(count, 0, -1)
This will also work if the input cannot be reversed (certain generators/iterators cannot be reversed):
>>> x_list = (i for i in range(10)) # it's a generator despite the variable name :-)
>>> ... arans solution ...
TypeError: 'generator' object is not reversible
>>> list(gen((i for i in range(10))))
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
And it works for your input:
>>> x_list = [1, 1, 2, 3, 3, 3]
>>> list(gen(x_list))
[2, 1, 1, 3, 2, 1]
This can actually be made simpler by using itertools.groupby:
import itertools
def gen(iterable):
for _, group in itertools.groupby(iterable):
length = sum(1 for _ in group) # or len(list(group))
yield from range(length, 0, -1)
>>> x_list = [1, 1, 2, 3, 3, 3]
>>> list(gen(x_list))
[2, 1, 1, 3, 2, 1]
I also did some benchmarks and according to these Aran-Feys solution is the fastest except for long lists where piRSquareds solution wins:
This was my benchmarking setup if you want to confirm the results:
from itertools import groupby, chain
import numpy as np
def gen1(iterable):
iterable = iter(iterable)
try:
last_seen = next(iterable)
except StopIteration:
return
count = 1
for item in iterable:
if item == last_seen:
count += 1
else:
yield from range(count, 0, -1)
count = 1
last_seen = item
yield from range(count, 0, -1)
def gen2(iterable):
for _, group in groupby(iterable):
length = sum(1 for _ in group)
yield from range(length, 0, -1)
def mseifert1(iterable):
return list(gen1(iterable))
def mseifert2(iterable):
return list(gen2(iterable))
def aran(x_list):
last_num = None
result = []
for num in reversed(x_list):
if num != last_num:
counter = 1
last_num = num
else:
counter += 1
result.append(counter)
return list(reversed(result))
def jpp(x_list):
gen = (range(len(list(j)), 0, -1) for _, j in groupby(x_list))
res = list(chain.from_iterable(gen))
return res
def cumcount(a):
a = np.asarray(a)
b = np.append(False, a[:-1] != a[1:])
c = b.cumsum()
r = np.arange(len(a))
return r - np.append(0, np.flatnonzero(b))[c] + 1
def pirsquared(x_list):
a = np.array(x_list)
return cumcount(a[::-1])[::-1]
from simple_benchmark import benchmark
import random
funcs = [mseifert1, mseifert2, aran, jpp, pirsquared]
args = {2**i: [random.randint(0, 5) for _ in range(2**i)] for i in range(1, 20)}
bench = benchmark(funcs, args, "list size")
%matplotlib notebook
bench.plot()
Python 3.6.5, NumPy 1.14
Here's a simple iterative approach to achieve it using collections.Counter:
from collections import Counter
x_list = [1, 1, 2, 3, 3, 3]
x_counter, run_list = Counter(x_list), []
for x in x_list:
run_list.append(x_counter[x])
x_counter[x] -= 1
which will return you run_list as:
[2, 1, 1, 3, 2, 1]
As an alternative, here's one-liner to achieve this using list comprehension with enumerate but it is not performance efficient due to iterative usage of list.index(..):
>>> [x_list[i:].count(x) for i, x in enumerate(x_list)]
[2, 1, 1, 3, 2, 1]
You can count the consecutive equal items and then add a countdown from count-of-items to 1 to the result:
def runs(p):
old = p[0]
n = 0
q = []
for x in p:
if x == old:
n += 1
else:
q.extend(range(n, 0, -1))
n = 1
old = x
q.extend(range(n, 0, -1))
return q
(A couple of minutes later) Oh, that's the same as MSeifert's code but without the iterable aspect. This version seems to be almost as fast as the method shown by Aran-Fey.

Python random list

I'm new to Python, and have some problems with creating random lists.
I'm using random.sample(range(x, x), y).
I want to get 4 lists with unique numbers, from 1-4, so I have been using this
a = random.sample(range(1, 5), 4)
b = random.sample(range(1, 5), 4)
c = random.sample(range(1, 5), 4)
d = random.sample(range(1, 5), 4)
So I get for example
a = 1, 3, 2, 4
b = 1, 4, 3, 2
c = 2, 3, 1, 4
d = 4, 2, 3, 1
How can I make it that the column are also unique?
Absent a clear mathematical theory, I distrust anything other than a somewhat hit-and-miss approach. In particular, backtracking approaches can introduce a subtle bias:
from random import shuffle
def isLatin(square):
#assumes that square is an nxn list
#where each row is a permutation of 1..n
n = len(square[0])
return all(len(set(col)) == n for col in zip(*square))
def randSquare(n):
row = [i for i in range(1,1+n)]
square = []
for i in range(n):
shuffle(row)
square.append(row[:])
return square
def randLatin(n):
#uses a hit and miss approach
while True:
square = randSquare(n)
if isLatin(square): return square
Typical output:
>>> s = randLatin(4)
>>> for r in s: print(r)
[4, 1, 3, 2]
[2, 3, 4, 1]
[1, 4, 2, 3]
[3, 2, 1, 4]
Totally random then:
def gen_matrix():
first_row = random.sample(range(1, 5), 4)
tmp = first_row + first_row
rows = []
for i in range(4):
rows.append(tmp[i:i+4])
return random.sample(rows, 4)
Create a list of all the elements, and as will filling the line, remove the used element.
import random
def fill_line(length):
my_list = list(range(length))
to_return = []
for i in range(length):
x = random.choice(my_list)
to_return.append(x)
my_list.remove(x)
return to_return
x = [fill_line(4)
for i in range(4)]
print(x)
Probably the simplest way is to create a valid matrix, and then shuffle the rows, and then shuffle the columns:
import random
def random_square(U):
U = list(U)
rows = [U[i:] + U[:i] for i in range(len(U))]
random.shuffle(rows)
rows_t = [list(i) for i in zip(*rows)]
random.shuffle(rows_t)
return rows_t
Usage:
>>> random_square(range(1, 1+4))
[[2, 3, 4, 1], [4, 1, 2, 3], [3, 4, 1, 2], [1, 2, 3, 4]]
This should be able to create any valid matrix with equal probability. After doing some reading it seems that this still has bias, although I don't fully comprehend why yet.
I would build a random latin square by 1) start with a single random permutation, 2) populate the rows with rotations 3) shuffle the rows 4) transpose the square 5) shuffle the rows again:
from collections import deque
from random import shuffle
def random_latin_square(elements):
elements = list(elements)
shuffle(elements)
square = []
for i in range(len(elements)):
square.append(list(elements))
elements = elements[1:] + [elements[0]]
shuffle(square)
square[:] = zip(*square)
shuffle(square)
return square
if __name__ == '__main__':
from pprint import pprint
square = random_latin_square('ABCD')
pprint(square)

Sorting in Sparse Matrix

I have a sparse matrix. I need to sort this matrix row-by-row and create another [sparse] matrix.
Code may explain it better:
# for `rand` function, you need newer version of scipy.
from scipy.sparse import *
m = rand(6,6, density=0.6)
d = m.getrow(0)
print d
Output1
(0, 5) 0.874881629788
(0, 4) 0.352559852239
(0, 2) 0.504791645463
(0, 1) 0.885898140175
I have this m matrix. I want to create a new matrix with sorted version of m. The new matrix
contains 0'th row like this.
new_d = new_m.getrow(0)
print new_d
Output2
(0, 1) 0.885898140175
(0, 5) 0.874881629788
(0, 2) 0.504791645463
(0, 4) 0.352559852239
So I can obtain which column is bigger etc:
print new_d.indices
Output3
array([1, 5, 2, 4])
Of course every row should be sorted like above independently.
I have one solution for this problem but it is not elegant.
If you're willing to ignore the zero-value elements of the matrix, the code below should work. It is also much faster than implementations that use the getrow method, which is rather slow.
from itertools import izip
def sort_coo(m):
tuples = izip(m.row, m.col, m.data)
return sorted(tuples, key=lambda x: (x[0], x[2]))
For example:
>>> from numpy.random import rand
>>> from scipy.sparse import coo_matrix
>>>
>>> d = rand(10, 20)
>>> d[d > .05] = 0
>>> s = coo_matrix(d)
>>> sort_coo(s)
[(0, 2, 0.004775589084940246),
(3, 12, 0.029941507166614145),
(5, 19, 0.015030386789436245),
(7, 0, 0.0075044957259399192),
(8, 3, 0.047994403933129481),
(8, 5, 0.049401058471327031),
(9, 15, 0.040011608000125043),
(9, 8, 0.048541825332137023)]
Depending on your needs you may want to tweak the sort keys in the lambda or further process the output. If you want everything in a row indexed dictionary you could do:
from collections import defaultdict
sorted_rows = defaultdict(list)
for i in sort_coo(m):
sorted_rows[i[0]].append((i[1], i[2]))
My bad solution is like this:
from scipy.sparse import coo_matrix
import numpy as np
a = []
for i in xrange(m.shape[0]): # assume m is square matrix.
d = m.getrow(i)
n = len(d.indices)
s = zip([i]*n, d.indices, d.data)
sorted_s = sorted(s, key=lambda v: v[2], reverse=True)
a.extend(sorted_s)
a = np.array(a)
new_m = coo_matrix((a[:,2], (a[:,0], a[:,1])), m.shape)
There can be some simple mistakes above because I have not checked it yet. But the idea is intuitive, I guess. Is there any good solution?
Edit
This new matrix creation may be useless because if you call getrow method then the order is broken again.
Only coo_matrix.col keeps the order.
Another Solution
This one is not exact solution but it may be helpful:
def sortSparseMatrix(m, rev=True, only_indices=True):
""" Sort a sparse matrix and return column index dictionary
"""
col_dict = dict()
for i in xrange(m.shape[0]): # assume m is square matrix.
d = m.getrow(i)
s = zip(d.indices, d.data)
sorted_s = sorted(s, key=lambda v: v[1], reverse=True)
if only_indices:
col_dict[i] = [element[0] for element in sorted_s]
else:
col_dict[i] = sorted_s
return col_dict
>>> print sortSparseMatrix(m)
{0: [5, 1, 0],
1: [1, 3, 5],
2: [1, 2, 3, 4],
3: [1, 5, 2, 4],
4: [0, 3, 5, 1],
5: [3, 4, 2]}

Categories

Resources