I'm using the following function to generate a transition table:
import numpy as np
import pandas as pd
def make_table(allSeq):
    n = max([ max(s) for s in allSeq ]) + 1
    arr = np.zeros((n,n), dtype=int)
    for seq in allSeq:
        ind = (seq[1:], seq[:-1])
        arr[ind] += 1
    return pd.DataFrame(arr).rename_axis(index='Next', columns='Current')
However, my result is incorrect:
list1 = [1,2,3,4,5,4,5,4,5]
list2 = [4,5,4,5]
make_table([list1, list2])
Current 0 1 2 3 4 5
Next
0 0 0 0 0 0 0
1 0 0 0 0 0 0
2 0 1 0 0 0 0
3 0 0 1 0 0 0
4 0 0 0 1 0 2
5 0 0 0 0 2 0
For example, the transition 4->5 should be counted 5 times, but it's only counted once per sequence (for a total of 2). I know the issue is the arr[ind] += 1 line, but I just can't figure it out! Do I nest another loop, or is there a slick way to add the total number of instances at once? Thanks!
Figured it out! Switched to the following:
def make_table(allSeq):
    n = max([ max(s) for s in allSeq ]) + 1
    arr = np.zeros((n,n), dtype=int)
    for seq in allSeq:
        for i, j in zip(seq[1:], seq[:-1]):
            ind = (i, j)
            arr[ind] += 1
    return pd.DataFrame(arr).rename_axis(index='Next', columns='Current')
Another loop seems like the easiest solution, with the slight twist of using zip:
import numpy as np
import pandas as pd
def make_table(allSeq):
    n = max([ max(s) for s in allSeq ]) + 1
    arr = np.zeros((n,n), dtype=int)
    for seq in allSeq:
        ind = zip(seq[1:], seq[:-1])
        for i in ind:
            arr[i] += 1
    return pd.DataFrame(arr).rename_axis(index='Next', columns='Current')
list1 = [1,2,3,4,5,4,5,4,5]
list2 = [4,5,4,5]
make_table([list1, list2])
returns
Current  0  1  2  3  4  5
Next
0        0  0  0  0  0  0
1        0  0  0  0  0  0
2        0  1  0  0  0  0
3        0  0  1  0  0  0
4        0  0  0  1  0  3
5        0  0  0  0  5  0
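For what it's worth, there is indeed a slick way to add all the counts at once: np.add.at performs unbuffered in-place addition, so repeated index pairs are each counted. (The original arr[ind] += 1 fails precisely because fancy-indexed += is buffered, so each distinct pair is incremented only once per sequence.) A minimal sketch, assuming the sequences can be converted to numpy arrays:

import numpy as np
import pandas as pd

def make_table(allSeq):
    n = max(max(s) for s in allSeq) + 1
    arr = np.zeros((n, n), dtype=int)
    for seq in allSeq:
        seq = np.asarray(seq)
        # unbuffered addition: every (next, current) pair increments its
        # cell, even when the same pair occurs several times in a sequence
        np.add.at(arr, (seq[1:], seq[:-1]), 1)
    return pd.DataFrame(arr).rename_axis(index='Next', columns='Current')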
I am attempting to find duplicate rows in a numpy array. The following code replicates the structure of my array which has n rows, m columns, and nz non-zero entries per row:
import numpy as np
import random
import datetime
def create_mat(n, m, nz):
    sample_mat = np.zeros((n, m), dtype='uint8')
    random.seed(42)
    for row in range(0, n):
        counter = 0
        while counter < nz:
            random_col = random.randrange(0, m-1, 1)
            if sample_mat[row, random_col] == 0:
                sample_mat[row, random_col] = 1
                counter += 1
    test = np.all(np.sum(sample_mat, axis=1) == nz)
    print(f'All rows have {nz} elements: {test}')
    return sample_mat
The code I am attempting to optimize is as follows:
if __name__ == '__main__':
    threshold = 2
    mat = create_mat(1800000, 108, 8)
    print(f'Time: {datetime.datetime.now()}')
    unique_rows, _, duplicate_counts = np.unique(mat, axis=0, return_counts=True, return_index=True)
    duplicate_indices = [int(x) for x in np.argwhere(duplicate_counts >= threshold)]
    print(f'Time: {datetime.datetime.now()}')
    print(f'Unique rows: {len(unique_rows)} Sample inds: {duplicate_indices[0:5]} Sample counts: {duplicate_counts[0:5]}')
    print(f'Sample rows:')
    print(unique_rows[0:5])
My output is as follows:
All rows have 8 elements: True
Time: 2022-06-29 12:08:07.320834
Time: 2022-06-29 12:08:23.281633
Unique rows: 1799994 Sample inds: [508991, 553136, 930379, 1128637, 1290356] Sample counts: [1 1 1 1 1]
Sample rows:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0]]
I have considered using numba, but the challenge is that it does not support the axis parameter. Similarly, converting to lists and using sets is an option, but then looping through to perform the duplicate counts seems "unpythonic".
Given that I need to run this code multiple times (since I am modifying the numpy array and then need to re-search for duplicates), the time is critical. I have also tried to use multiprocessing against this step, but np.unique seems to be blocking (i.e. even when I try to run multiple versions of this, I end up constrained to one thread running at 6% CPU capacity while the other threads sit idle).
Step 1: bit packing
Since your matrix only contains binary values, you can aggressively pack the bits into uint64 values so as to perform a much more efficient sort afterwards. Here is a Numba implementation:
import numpy as np
import numba as nb

@nb.njit('(uint8[:,::1],)', parallel=True)
def pack_bits(mat):
    n, m = mat.shape
    res = np.zeros((n, (m+63)//64), np.uint64)
    for i in nb.prange(n):
        for bj in range(0, m, 64):
            val = np.uint64(0)
            if bj + 64 <= m:
                # Fast case
                for j in range(64):
                    val += np.uint64(mat[i, bj+j]) << (63 - j)
            else:
                # Slow case (boundary)
                for j in range(m - bj):
                    val += np.uint64(mat[i, bj+j]) << (63 - j)
            res[i, bj//64] = val
    return res

@nb.njit('(uint64[:,::1], int_)', parallel=True)
def unpack_bits(mat, m):
    n = mat.shape[0]
    assert mat.shape[1] == (m+63)//64
    res = np.zeros((n, m), np.uint64)
    for i in nb.prange(n):
        for bj in range(0, m, 64):
            val = np.uint64(mat[i, bj//64])
            if bj + 64 <= m:
                # Fast case
                for j in range(64):
                    res[i, bj+j] = np.uint8((val >> (63 - j)) & 1)
            else:
                # Slow case (boundary)
                for j in range(m - bj):
                    res[i, bj+j] = np.uint8((val >> (63 - j)) & 1)
    return res
The np.unique function can be called on the much smaller packed array, as in the initial code (except that the resulting sorted array is a packed one and needs to be unpacked). Since you do not need the indices, it is better not to compute them; thus, return_index=True can be removed. Additionally, only the required values need to be unpacked (unpacking is a bit more expensive than packing, because writing a big matrix is more expensive than reading an existing one).
if __name__ == '__main__':
    threshold = 2
    n, m = 1800000, 108
    mat = create_mat(n, m, 8)
    print(f'Time: {datetime.datetime.now()}')
    packed_mat = pack_bits(mat)
    duplicate_packed_rows, duplicate_counts = np.unique(packed_mat, axis=0, return_counts=True)
    duplicate_indices = [int(x) for x in np.argwhere(duplicate_counts >= threshold)]
    print(f'Time: {datetime.datetime.now()}')
    print(f'Duplicate rows: {len(duplicate_packed_rows)} Sample inds: {duplicate_indices[0:5]} Sample counts: {duplicate_counts[0:5]}')
    print(f'Sample rows:')
    print(unpack_bits(duplicate_packed_rows[0:5], m))
Step 2: np.unique optimizations
The np.unique call is sub-optimal, as it performs multiple expensive internal sorting steps. Not all of them are needed in your specific case, and some steps can be optimized.
A more efficient implementation consists in sorting the last column during a first step, then sorting the previous column, and so on until the first column is sorted, similar to what a radix sort does. Note that the last column can be sorted using a non-stable algorithm (generally faster), but the others need a stable one. This method is still sub-optimal, as argsort calls are slow and the current implementation does not use multiple threads yet. Unfortunately, Numpy does not yet provide any efficient way to sort the rows of a 2D array. While it is possible to reimplement this in Numba, doing so is cumbersome, a bit tricky and bug-prone. Not to mention that Numba introduces some overhead compared to native C/C++ code. Once sorted, the unique/duplicate rows can be tracked and counted. Here is an implementation:
def sort_lines(mat):
    n, m = mat.shape
    for i in range(m):
        kind = 'stable' if i > 0 else None
        mat = mat[np.argsort(mat[:, m-1-i], kind=kind)]
    return mat
@nb.njit('(uint64[:,::1],)', parallel=True)
def find_duplicates(sorted_mat):
    n, m = sorted_mat.shape
    assert m >= 0
    isUnique = np.zeros(n, np.bool_)
    uniqueCount = 1
    if n > 0:
        isUnique[0] = True
    for i in nb.prange(1, n):
        isUniqueVal = False
        for j in range(m):
            isUniqueVal |= sorted_mat[i, j] != sorted_mat[i-1, j]
        isUnique[i] = isUniqueVal
        uniqueCount += isUniqueVal
    uniqueValues = np.empty((uniqueCount, m), np.uint64)
    duplicateCounts = np.zeros(len(uniqueValues), np.uint64)
    cursor = 0
    for i in range(n):
        cursor += isUnique[i]
        for j in range(m):
            uniqueValues[cursor-1, j] = sorted_mat[i, j]
        duplicateCounts[cursor-1] += 1
    return uniqueValues, duplicateCounts
The previous np.unique call can be replaced by find_duplicates(sort_lines(packed_mat)).
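In context, the driver from Step 1 might then look like this (a sketch reusing the variable names from the earlier snippet):

packed_mat = pack_bits(mat)
unique_packed_rows, duplicate_counts = find_duplicates(sort_lines(packed_mat))
duplicate_indices = [int(x) for x in np.argwhere(duplicate_counts >= threshold)]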
Step 3: GPU-based np.unique
While implementing a fast algorithm to sort rows is not easy on the CPU with Numba and Numpy, one can simply use CuPy to do it on the GPU, assuming an Nvidia GPU is available and CUDA is installed (as well as CuPy). This solution has the benefit of being simple and significantly more efficient. Here is an example:
import cupy as cp

def cupy_sort_lines(mat):
    cupy_mat = cp.array(mat)
    return cupy_mat[cp.lexsort(cupy_mat.T[::-1, :])].get()
The previous sort_lines call can be replaced by cupy_sort_lines.
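So the full pipeline becomes, for example:

packed_mat = pack_bits(mat)
unique_packed_rows, duplicate_counts = find_duplicates(cupy_sort_lines(packed_mat))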
Results
Here are the timings on my machine with a 6-core i5-9600KF CPU and a Nvidia 1660 Super GPU:
Initial version: 15.541 s
Optimized packing: 0.982 s
Optimized np.unique: 0.634 s
GPU-based sorting: 0.143 s (requires an Nvidia GPU)
Thus, the CPU-based optimized version is about 25 times faster and the GPU-based one is about 109 times faster. Note that the sort takes a significant fraction of the time in all versions. Also, please note that the unpacking is not included in the benchmark (as seen in the provided code). It takes a negligible time as long as only a few rows are unpacked and not the full array (which takes roughly ~200 ms on my machine). This last operation can be further optimized at the expense of a significantly more complex implementation.
Just to share a naive solution as a baseline:
version 1:
def get_duplicate_indexes(data):
    signature_to_indexes = {}
    index = 0
    for row in data:
        key = row.data.tobytes()
        if key not in signature_to_indexes:
            signature_to_indexes[key] = [index]
        else:
            signature_to_indexes[key].append(index)
        index = index + 1
    return [
        indexes
        for indexes in signature_to_indexes.values()
        if len(indexes) > 1
    ]
In [122]: %timeit get_duplicate_indexes(mat)
833 ms ± 5.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [125]: %time get_duplicate_indexes(mat)
CPU times: user 1.5 s, sys: 87.1 ms, total: 1.59 s
Wall time: 1.59 s
In [123]: get_duplicate_indexes(mat)
Out[123]:
[[1396, 402980],
[769782, 1421364],
[875866, 1476693],
[892483, 1194500],
[1230863, 1478688],
[1311189, 1426136]]
version 2 (after discussing with @Kelly Bundy):
def get_duplicate_indexes(data):
    signature_to_indexes = {}
    duplicates = {}
    for index, row in enumerate(data):
        key = row.data.tobytes()
        if key not in signature_to_indexes:
            signature_to_indexes[key] = index
        else:
            indexes = signature_to_indexes[key]
            if isinstance(indexes, int):
                duplicates[key] = signature_to_indexes[key] = [indexes, index]
            else:
                indexes.append(index)
    return list(duplicates.values())
In [146]: %timeit get_duplicate_indexes(mat)
672 ms ± 3.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [147]: %time get_duplicate_indexes(mat)
CPU times: user 652 ms, sys: 46.6 ms, total: 699 ms
Wall time: 698 ms
JFYI: initial version took ~14s on my machine.
If you don't want to use bit packing or to call np.unique on a multidimensional array, you can also create a view. It's slower than the other optimized approaches, but rather educational:
import numpy as np

def create_view(arr):
    # collapse each row into a single void scalar so whole rows compare as units
    arr = np.ascontiguousarray(arr)
    void_dt = np.dtype((np.void, arr.dtype.itemsize * arr.shape[1]))
    return arr.view(void_dt).ravel()

def create_mat(n, m, nz):
    rng = np.random.default_rng()
    data = rng.permuted(np.full((n, m), [1] * nz + [0] * (m - nz), dtype=np.uint8), axis=1)
    return data

if __name__ == '__main__':
    mat = create_mat(1800000, 108, 8)
    mat_view = create_view(mat)
    u, idx, counts = np.unique(mat_view, axis=0, return_index=True, return_counts=True)
    duplicate_indices = np.flatnonzero(counts >= 2)
    print(f'Duplicate inds: {duplicate_indices}')
    print(f'Duplicate counts: {counts[duplicate_indices]}')
    print(f'Unique rows: {len(u)}')
Note that I have fixed some code as well; create_mat could be done in a more efficient way. Runtime is ~3 seconds :)
I would suggest scipy.stats.rankdata, with your favorite axis as the axis argument. Notice that with method set to 'min', the unique values in the resulting array give you the indices of the unique rows.
Basically, I have a list of 0s and 1s. Each value in the list represents a data sample from an hour, so 24 entries in the list correspond to 24 hours, or a single day. I want to capture the first time the data cycles from 0s to 1s and back to 0s within a span of 24 hours (or vice versa, from 1s to 0s and back to 1s).
signal = [1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1]
expected output:
# D
signal = [1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0]
output = [0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
# ^ cycle.1:day.1 |dayline ^cycle.1:day.2
In the output list, a 1 means that one cycle is completed at that position of the signal list; every other position is 0. There should be only one cycle per day, which is why there is only a single 1.
I don't know how to split the list accordingly, so can someone please help?
It seems to me that what you are trying to do is split your data into blocks of 24, and then find either the first rising edge or the first falling edge, depending on the first hour in that block.
Below I have tried to distill my understanding of what you are trying to accomplish into the following function. It takes a numpy array containing zeros and ones, as in your example, checks what the first hour of the day is, and decides what type of edge to look for.
It detects an edge by using np.diff, which gives us an array containing -1s, 0s, and 1s. We then look for the first index of either a -1 (falling edge) or a 1 (rising edge). The function returns that index; if no edges were found, it returns the index of the last element, or nothing.
For more info, see the docs for the numpy features used here: np.diff, np.ndarray.nonzero, np.array_split.
import numpy as np

def get_cycle_index(day):
    '''
    Returns the first index of a cycle as defined by nipun vats.
    If no cycle is found, returns nothing.
    '''
    first_hour = day[0]
    if first_hour == 0:
        edgetype = -1
    else:
        edgetype = 1
    edges = np.diff(np.r_[day, day[-1]])
    if (edges == edgetype).any():
        return (edges == edgetype).nonzero()[0][0]
    elif (day.sum() == day.size) or day.sum() == 0:
        return
    else:
        return day.size - 1
Below is an example of how you might use this function in your case.
import numpy as np

_data = [1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
#_data = np.random.randint(0, 2, 280, dtype='int')
data = np.array(_data, 'int')

# split the data into a set of 'day' blocks
blocks = np.array_split(data, np.arange(24, data.size, 24))

_output = []
for i, day in enumerate(blocks):
    print(f'day {i}')
    buffer = np.zeros(day.size, dtype='int')
    print('\tsignal:', *day, sep=' ')
    cycle_index = get_cycle_index(day)
    if cycle_index:
        buffer[cycle_index] = 1
    print('\toutput:', *buffer, sep=' ')
    _output.append(buffer)

output = np.concatenate(_output)
print('\nfinal output:\n', *output, sep=' ')
This yields the following output:
day 0
signal: 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0
output: 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
day 1
signal: 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
output: 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
day 2
signal: 0 0 0 0 0 0
output: 0 0 0 0 0 0
final output:
0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I am very new to Python and coding. I have this homework that I have to do:
You will receive on the first line the number of rows of the matrix (n), and on the next n lines you will get each row of the matrix as a string (zeros and ones separated by a single space). You have to calculate how many blocks you have (ones connected horizontally or vertically). Here are some examples:
Input:
5
1 1 0 0 0
1 1 0 0 0
0 0 0 0 0
0 0 0 1 1
0 0 0 1 1
Output:
2
Input:
6
1 1 0 1 0 1
0 1 1 1 1 1
0 1 0 0 0 0
0 1 1 0 0 0
0 1 1 1 1 0
0 0 0 1 1 0
Output:
1
Input:
4
0 1 0 1 1 0
1 0 1 1 0 1
1 0 0 0 0 0
0 0 0 1 0 0
Output:
5
The code I came up with so far is:
n = int(input())
blocks = 0
matrix = [[int(i) for i in input().split()] for j in range(n)]
#loop or something to find the blocks in the matrix
print(blocks)
Any help will be greatly appreciated.
def valid(y, x):
    if y >= 0 and x >= 0 and y < N and x < horizontal_len:
        return True

def find_blocks(y, x):
    Q.append(y)
    Q.append(x)
    # search in 4 directions (up, right, left, down)
    dy = [0, 1, 0, -1]
    dx = [1, 0, -1, 0]
    # when nothing is left in Q, the block has been fully counted
    while Q:
        y = Q.pop(0)
        x = Q.pop(0)
        for dir in range(len(dy)):
            next_y = y + dy[dir]
            next_x = x + dx[dir]
            # if the neighbour is in a valid range (inside the matrix)
            # and is a 1 (not 0), include it as part of the block
            if valid(next_y, next_x) and matrix[next_y][next_x] == 1:
                Q.append(next_y)
                Q.append(next_x)
                matrix[next_y][next_x] = -1

N = int(input())
matrix = []
for rows in range(N):
    row = list(map(int, input().split()))
    matrix.append(row)

# row length
horizontal_len = len(matrix[0])

blocks = 0
# search from matrix[0][0] to matrix[N-1][horizontal_len-1]
for start_y in range(N):
    for start_x in range(horizontal_len):
        # if a number is 1 then start counting
        if matrix[start_y][start_x] == 1:
            # set visited 1s to -1 so they are not counted again
            matrix[start_y][start_x] = -1
            Q = []
            # start BFS
            find_blocks(start_y, start_x)
            blocks += 1

print(blocks)
I used a BFS algorithm to solve this question. The comments may not be enough to explain the logic, so if you have questions about this solution, let me know!
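One small note on the queue: list.pop(0) is O(n), so on large matrices a collections.deque with popleft() is the more idiomatic BFS queue. A minimal sketch of the same traversal with that change (storing (y, x) tuples instead of two separate entries; it relies on the valid helper and global matrix above, and the caller no longer needs to create Q before each call):

from collections import deque

def find_blocks(y, x):
    Q = deque([(y, x)])
    dy = [0, 1, 0, -1]
    dx = [1, 0, -1, 0]
    while Q:
        y, x = Q.popleft()  # O(1), unlike list.pop(0)
        for d in range(4):
            next_y, next_x = y + dy[d], x + dx[d]
            if valid(next_y, next_x) and matrix[next_y][next_x] == 1:
                matrix[next_y][next_x] = -1  # mark visited
                Q.append((next_y, next_x))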
I have this line in some Matlab script that I'm trying to convert to Python. Here m = 20 and n = 20, and the dimensions of I_true equal [400, 1].
I want to convert the following Matlab code:
A=zeros((2*m*n),(2*m*n)+2);
A(1:m*n,(2*m*n)+1)=-I_true(:);
Am I converting it right?
Converted code in Python:
for i in range(0, m*n):
    for j in range((2*m*n)+1):
        A[i][j] = I_true[i]
Let's look at a small example, with n = 2, m = 2:
In Octave (and presumably Matlab):
octave:50> m = 2; n = 2;
octave:51> I_true = [1;2;3;4];
octave:52> A = zeros((2*m*n),(2*m*n)+2);
octave:53> A(1:m*n,(2*m*n)+1)=-I_true(:)
A =
0 0 0 0 0 0 0 0 -1 0
0 0 0 0 0 0 0 0 -2 0
0 0 0 0 0 0 0 0 -3 0
0 0 0 0 0 0 0 0 -4 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
The equivalent in Python (with n = 20, m = 20) would be
import numpy as np

n, m = 20, 20
I_true = np.arange(1, n*m+1)  # just as an example
A = np.zeros((2*m*n, 2*(n*m+1)), dtype=I_true.dtype)
A[:m*n, 2*m*n] = -I_true
The reason why the last line uses A[:m*n, 2*m*n] and not A[1:m*n, (2*m*n)+1] is
because Python uses 0-based indexing whereas Matlab uses 1-based indexing.
Check this SO question as well.
You can define a matrix with 2*m*n rows and 2*m*n+2 columns in Python like this:

m = 20
n = 20
a = [[0 for i in range((2*m*n)+2)] for j in range(2*m*n)]
Now that you have your matrix, you can assign values to its elements in different ways. One example would be using for loops to assign values from another matrix of the same size:
for i in range(2*m*n):
    for j in range((2*m*n)+2):
        a[i][j] = I_true[i][j]
I hope it helps.
I need to create a matrix that calculates the LCS and then print it out. This is my code, but I'm having trouble with the print function (I don't know how to get the LCSmatrix values into the printing):
def compute_LCS(seqA, seqB):
    for row in seqA:
        for col in seqB:
            if seqA[row] == seqB[col]:
                if row == 0 or col == 0:
                    LCSmatrix(row, col) = 1
                else:
                    LCSmatrix(row, col) = LCS(row-1, col-1) + 1
            else:
                LCSmatrix(row, col) = 0
    return LCSmatrix

def printMatrix(parameters...):
    print ' ',
    for i in seqA:
        print i,
    print
    for i, element in enumerate(LCSMatrix):
        print i, ' '.join(element)

matrix = LCSmatrix
print printMatrix(compute_LCS(seqA, seqB))
Any help would be much appreciated.
Try this:
seqA = 'AACTGGCAG'
seqB = 'TACGCTGGA'

def compute_LCS(seqA, seqB):
    # rows are indexed by seqB, columns by seqA
    LCSmatrix = [len(seqA)*[0] for row in seqB]
    for row in range(len(seqB)):
        for col in range(len(seqA)):
            if seqB[row] == seqA[col]:
                if row == 0 or col == 0:
                    LCSmatrix[row][col] = 1
                else:
                    LCSmatrix[row][col] = LCSmatrix[row-1][col-1] + 1
            else:
                LCSmatrix[row][col] = 0
    return LCSmatrix

def printMatrix(seqA, seqB, LCSmatrix):
    print ' '.join('%2s' % x for x in ' ' + seqA)
    for i, element in enumerate(LCSmatrix):
        print '%2s' % seqB[i], ' '.join('%2i' % x for x in element)

matrix = compute_LCS(seqA, seqB)
printMatrix(seqA, seqB, matrix)
The above produces:
A A C T G G C A G
T 0 0 0 1 0 0 0 0 0
A 1 1 0 0 0 0 0 1 0
C 0 0 2 0 0 0 1 0 0
G 0 0 0 0 1 1 0 0 1
C 0 0 1 0 0 0 2 0 0
T 0 0 0 2 0 0 0 0 0
G 0 0 0 0 3 1 0 0 1
G 0 0 0 0 1 4 0 0 1
A 1 1 0 0 0 0 0 1 0
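If you are on Python 3, where print is a function, a hypothetical equivalent of the printing helper would be:

def printMatrix(seqA, seqB, LCSmatrix):
    print(' '.join('%2s' % x for x in ' ' + seqA))
    for i, element in enumerate(LCSmatrix):
        print('%2s' % seqB[i], ' '.join('%2i' % x for x in element))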