Count of all possible combinations between dataframe columns - python

I'm trying to get the count (where all row values are 1) of each possible combination between the eight columns of a dataframe. Basically I need to understand how many times different overlaps exist.
I've tried to use itertools.product to get all the combinations, but it doesn't seem to work.
import pandas as pd
import numpy as np
import itertools
df = pd.read_excel('filename.xlsx')
df.head(15)
a b c d e f g h
0 1 0 0 0 0 1 0 0
1 1 0 0 0 0 0 0 0
2 1 0 1 1 1 1 1 1
3 1 0 1 1 0 1 1 1
4 1 0 0 0 0 0 0 0
5 0 1 0 0 1 1 1 1
6 1 1 0 0 1 1 1 1
7 1 1 1 1 1 1 1 1
8 1 1 0 0 1 1 0 0
9 1 1 1 0 1 0 1 0
10 1 1 1 0 1 1 0 0
11 1 0 0 0 0 1 0 0
12 1 1 1 1 1 1 1 1
13 1 1 1 1 1 1 1 1
14 0 1 1 1 1 1 1 0
print(list(itertools.product(df.columns)))
The expected output would be a dataframe with the count (n) of rows for each valid combination (where the values in the row are all 1).
For example:
a b
0 1 0
1 1 0
2 1 0
3 1 0
4 1 0
5 0 1
6 1 1
7 1 1
8 1 1
9 1 1
10 1 1
11 1 0
12 1 1
13 1 1
14 0 1
Would give
combination count
a 12
a_b 7
b 9
Note that the output would need to contain all the possible combinations between a and h, not just the pairwise ones.

Powerset Combinations
Use the powerset recipe with:
s = pd.Series({
    '_'.join(c): df[c].min(axis=1).sum()
    for c in map(list, filter(None, powerset(df)))
})
a 13
b 9
c 8
d 6
e 10
f 12
g 9
h 7
a_b 7
...
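Here powerset is the standard recipe from the itertools documentation (it is repeated in the benchmark further down). The trick is that df[c].min(axis=1) is 1 exactly for rows where every column in c is 1, so its sum counts the qualifying rows.
from itertools import chain, combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))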
Pairwise Combinations
This is a special case, and can be vectorized: use dot, then extract the upper triangular matrix values:
from itertools import combinations
u = df.T.dot(df)
pd.DataFrame({
    'combination': [*map('_'.join, combinations(df, 2))],
    # pandas < 0.24
    # 'count': u.values[np.triu_indices_from(u, k=1)]
    # pandas >= 0.24
    'count': u.to_numpy()[np.triu_indices_from(u, k=1)]
})
combination count
0 a_b 7
1 a_c 7
2 a_d 5
3 a_e 8
4 a_f 10
5 a_g 7
6 a_h 6
7 b_c 6
8 b_d 4
9 b_e 9
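As a sanity check on why the dot product works: for 0/1 columns, entry (i, j) of df.T.dot(df) is the sum of df[i] * df[j] over the rows, i.e. the number of rows where both columns are 1. Assuming the example df above:
# the co-occurrence entry equals the direct pairwise count
assert u.loc['a', 'b'] == (df['a'] * df['b']).sum()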

As you happen to have 8 columns, np.packbits together with
np.bincount is rather convenient here:
import numpy as np
import pandas as pd
# make large example
ncol, nrow = 8, 1_000_000
df = pd.DataFrame(np.random.randint(0,2,(nrow,ncol)), columns=list("abcdefgh"))
from time import time
T = [time()]
# encode each row of 8 bits as a single byte (0-255) and count pattern occurrences
counts = np.bincount(np.packbits(df.values.astype(np.uint8)), minlength=256)
# contained[i, j] is True when bit pattern i is a subset of bit pattern j
rng = np.arange(256, dtype=np.uint8)
contained = (rng & rng[:, None]) == rng[:, None]
# a combination's count is the sum of counts over all patterns containing it
ccounts = (counts * contained).sum(1)
# if there are empty bins, remove them
nz = np.where(ccounts)[0].astype(np.uint8)
# helper to build bin labels
a2h = np.array(list("abcdefgh"))
# put labels to counts
result = pd.Series(ccounts[nz], index=["_".join(a2h[np.unpackbits(i).view(bool)]) for i in nz])
from itertools import chain, combinations
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))
T.append(time())
s = pd.Series({
    '_'.join(c): df[c].min(axis=1).sum()
    for c in map(list, filter(None, powerset(df)))
})
T.append(time())
print("packbits {:.3f} powerset {:.3f}".format(*np.diff(T)))
print("results equal", (result.sort_index()[1:]==s.sort_index()).all())
This gives the same result as the powerset approach but over 1000x faster:
packbits 0.016 powerset 21.974
results equal True
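To make the encoding step concrete: np.packbits turns each row of eight 0/1 values into a single byte, so every distinct row pattern becomes a number 0-255 that np.bincount can tally. A minimal illustration:
row = np.array([1, 0, 1, 1, 1, 1, 1, 1], dtype=np.uint8)
np.packbits(row)                 # array([191], dtype=uint8), since 0b10111111 == 191
np.unpackbits(np.packbits(row))  # round-trips back to the original eight bits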

If you have just values of 1 and 0, you could do:
df = pd.DataFrame({
'a': [1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1],
'b': [1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0],
'c': [1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1],
'd': [1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1],
})
(df.a * df.b).sum()
This results in 4.
To get all combinations you can use combinations from itertools:
from itertools import combinations
analyze = [(col,) for col in df.columns]
analyze.extend(combinations(df.columns, 2))
for cols in analyze:
    num_ser = None
    for col in cols:
        if num_ser is None:
            num_ser = df[col]
        else:
            num_ser *= df[col]
    num = num_ser.sum()
    print(f'{cols} contains {num}')
This results in:
('a',) contains 4
('b',) contains 7
('c',) contains 11
('d',) contains 23
('a', 'b') contains 4
('a', 'c') contains 4
('a', 'd') contains 4
('b', 'c') contains 7
('b', 'd') contains 7
('c', 'd') contains 11
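The loop above only covers singletons and pairs; to meet the original question's "all combinations" requirement, you could extend analyze with every subset size before running the loop, for example:
# add triples, quadruples, ... up to the full column set
for r in range(3, len(df.columns) + 1):
    analyze.extend(combinations(df.columns, r))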

A co-occurrence matrix is all you need:
Let's construct an example first:
import numpy as np
import pandas as pd
mat = np.zeros((5,5))
mat[0,0] = 1
mat[0,1] = 1
mat[1,0] = 1
mat[2,1] = 1
mat[3,3] = 1
mat[3,4] = 1
mat[2,4] = 1
cols = ['a','b','c','d','e']
df = pd.DataFrame(mat,columns=cols)
print(df)
a b c d e
0 1.0 1.0 0.0 0.0 0.0
1 1.0 0.0 0.0 0.0 0.0
2 0.0 1.0 0.0 0.0 1.0
3 0.0 0.0 0.0 1.0 1.0
4 0.0 0.0 0.0 0.0 0.0
Now we construct the co-occurrence matrix:
# construct the co-occurrence matrix:
co_df = df.T.dot(df)
print(co_df)
a b c d e
a 2.0 1.0 0.0 0.0 0.0
b 1.0 2.0 0.0 0.0 1.0
c 0.0 0.0 0.0 0.0 0.0
d 0.0 0.0 0.0 1.0 1.0
e 0.0 1.0 0.0 1.0 2.0
Finally the result you need:
result = {}
for c1 in cols:
    for c2 in cols:
        if c1 == c2:
            if c1 not in result:
                result[c1] = co_df[c1][c2]
        else:
            # sort the pair so 'a_b' and 'b_a' collapse into a single key
            key = '_'.join(sorted([c1, c2]))
            if key not in result:
                result[key] = co_df[c1][c2]
print(result)
{'a': 2.0, 'a_b': 1.0, 'a_c': 0.0, 'a_d': 0.0, 'a_e': 0.0, 'b': 2.0, 'b_c': 0.0, 'b_d': 0.0, 'b_e': 1.0, 'c': 0.0, 'c_d': 0.0, 'c_e': 0.0, 'd': 1.0, 'd_e': 1.0, 'e': 2.0}
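If you want the dict in the question's requested two-column layout, a small (hypothetical) reshaping step would be:
# reshape the dict into a 'combination'/'count' frame
out = pd.Series(result).rename_axis('combination').reset_index(name='count')
print(out)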

Related

Calculate the average of sections of a column with condition met to create new dataframe

I have the data table below:
A = [2, 3, 1, 2, 4, 1, 5, 3, 1, 7, 5]
B = [0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
df = pd.DataFrame({'A':A, 'B':B})
I'd like to calculate the average of column A over each run of consecutive rows where column B equals 1. All rows where column B equals 0 are ignored, and the result should be a new dataframe like below:
     A    B
0  2.0  1.0
1  3.0  1.0
Thanks for your help!
Keywords: groupby, shift, mean
Code:
df_result = df.groupby((df['B'].shift(1, fill_value=0) != df['B']).cumsum()).mean()
df_result = df_result[df_result['B'] != 0]
df_result
A B
1 2.0 1.0
3 3.0 1.0
As you might have noticed, you first need to determine the blocks of consecutive rows having the same B value.
One way to do so is by shifting B one row and then comparing it with itself:
df['B_shifted'] = df['B'].shift(1, fill_value=0)  # fill_value=0 to return int and replace NaN with 0s
df['A']         = [2, 3, 1, 2, 4, 1, 5, 3, 1, 7, 5]
df['B']         = [0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
df['B_shifted'] = [0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0]
(df['B_shifted'] != df['B']) = [F, T, F, F, T, F, T, F, F, T, F]
Each True marks the first row of a new block, so the cumulative sum of this comparison assigns a distinct group number to every block.
Now we can use the groupby pandas method as follows:
df_grouped=df.groupby((df['B_shifted'] != df['B']).cumsum())
Now if we loop over the DataFrameGroupBy object df_grouped, we'll see the following tuples:
(0,    A  B  B_shifted
    0  2  0          0)
(1,    A  B  B_shifted
    1  3  1          0
    2  1  1          1
    3  2  1          1)
(2,    A  B  B_shifted
    4  4  0          1
    5  1  0          0)
(3,    A  B  B_shifted
    6  5  1          0
    7  3  1          1
    8  1  1          1)
(4,     A  B  B_shifted
    9   7  0          1
    10  5  0          0)
We can now simply calculate the mean and filter out the zero values as follows:
df_result = df_grouped.mean()
df_result = df_result[df_result['B'] != 0][['A', 'B']]
Try:
m = (df.B != df.B.shift(1)).cumsum() * df.B
df_out = df.groupby(m[m > 0])["A"].mean().reset_index(drop=True).to_frame()
df_out["B"] = 1
print(df_out)
Prints:
A B
0 2 1
1 3 1
df1 = df.groupby((df['B'].shift() != df['B']).cumsum()).mean().reset_index(drop=True)
df1 = df1[df1['B'] == 1].astype(int).reset_index(drop=True)
df1
Output
A B
0 2 1
1 3 1
Explanation
We check whether each row's value of B differs from the previous row's value using shift; the cumulative sum of that comparison groups the consecutive equal values, and we take each group's mean into the new dataframe df1.
Since this gives the mean of every block of consecutive 0s and 1s, we then keep only the rows where B == 1.

How to get array of numbers using ones() in numpy?

Hi, I have some Matlab code which generates the following sequence.
[ones(1,6*2) 2 ones(1,6*2-1) 2 ones(1,6*2) 1]
ans =
Columns 1 through 18
1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
Columns 19 through 36
1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
Columns 37 through 38
1 1
I want to generate a similar array of numbers in Python.
I have tried the following:
ConvStride = [np.ones((12,), dtype=int), 2, np.ones((11,), dtype=int), 2, np.ones((12,), dtype=int), 1]
but this gives a list whose elements are arrays and scalars, i.e. [array([1, 1, ..., 1]), 2, array([1, 1, ..., 1]), 2, array([1, 1, ..., 1]), 1], while what I need is one flat array:
[1 1 1 ... 1 2 1 1 1 ... 1 2 1 1 1 ... 1 1]
Could you please let me know a workaround here.
Use np.r_ (note the trailing 1 to match the Matlab sequence):
np.r_[np.ones(12, int), 2, np.ones(11, int), 2, np.ones(12, int), 1]
# array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
You can create a list using similar Python syntax and then convert it to a numpy array:
import numpy as np
x = [1]*(6*2) + [2] + [1]*(6*2-1) + [2] + [1]*(6*2) + [1]
ans = np.array(x)
If you want to do it all with numpy you can use hstack.
np.hstack([np.ones(6*2, int), 2, np.ones(6*2-1, int), 2, np.ones(6*2, int), 1])
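np.concatenate gives the same result; unlike np.r_ and np.hstack it accepts only array-likes, so the scalars have to be wrapped in lists:
np.concatenate([np.ones(12, int), [2], np.ones(11, int), [2], np.ones(12, int), [1]])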

Pandas DataFrame column concatenation

I have a pandas DataFrame y with 1 million rows and 5 columns.
np.shape(y)
(1037889, 5)
The column values are all 0 or 1. Looks something like this:
y.head()
a, b, c, d, e
0, 0, 1, 0, 0
1, 0, 0, 1, 1
0, 1, 1, 1, 1
0, 0, 0, 0, 0
I want a DataFrame with 1 million rows and 1 column.
np.shape(y)
(1037889, )
where the column is just the 5 columns concatenated together.
New column
0, 0, 1, 0, 0
1, 0, 0, 1, 1
0, 1, 1, 1, 1
0, 0, 0, 0, 0
I keep trying different things like merge, concat, dstack, etc., but can't seem to figure this out.
If you want the new column to have all the data concatenated into a string, it's a good case for the apply() function:
>>> df = pd.DataFrame({'a': [0, 1, 0, 0], 'b': [0, 0, 1, 0], 'c': [1, 0, 1, 0], 'd': [0, 1, 1, 0], 'e': [0, 1, 1, 0]})
>>> df
   a  b  c  d  e
0  0  0  1  0  0
1  1  0  0  1  1
2  0  1  1  1  1
3  0  0  0  0  0
>>> df2 = df.apply(lambda row: ','.join(map(str, row)), axis=1)
>>> df2
0    0,0,1,0,0
1    1,0,0,1,1
2    0,1,1,1,1
3    0,0,0,0,0
dtype: object
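A slightly terser variant of the same idea (assuming all values are plain integers) is to cast the whole frame to strings once and join each row:
>>> df.astype(str).apply(','.join, axis=1)
0    0,0,1,0,0
1    1,0,0,1,1
2    0,1,1,1,1
3    0,0,0,0,0
dtype: object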

Python sage: How do I compute a nullspace (kernel) for a stoichiometric matrix?

In a desperate attempt to switch from Matlab to python, I am encountering the following problem:
In Matlab, I am able to define a matrix like:
N = [1 0 0 0 -1 -1 -1 0 0 0;% A
0 1 0 0 1 0 0 -1 -1 0;% B
0 0 0 0 0 1 0 1 0 -1;% C
0 0 0 0 0 0 1 0 0 -1;% D
0 0 0 -1 0 0 0 0 0 1;% E
0 0 -1 0 0 0 0 0 1 1]% F
The rational basis nullspace (kernel) can then be calculated by:
K_nur= null(N,'r')
And the orthonormal basis like:
K_nuo= null(N)
This outputs the following:
N =
1 0 0 0 -1 -1 -1 0 0 0
0 1 0 0 1 0 0 -1 -1 0
0 0 0 0 0 1 0 1 0 -1
0 0 0 0 0 0 1 0 0 -1
0 0 0 -1 0 0 0 0 0 1
0 0 -1 0 0 0 0 0 1 1
K_nur =
1 -1 0 2
-1 1 1 0
0 0 1 1
0 0 0 1
1 0 0 0
0 -1 0 1
0 0 0 1
0 1 0 0
0 0 1 0
0 0 0 1
K_nuo =
0.5933 0.1332 0.3070 -0.3218
-0.0930 0.0433 0.2029 0.7120
0.1415 0.0084 0.5719 0.2220
0.3589 0.1682 -0.0620 0.1682
-0.1628 0.4518 0.3389 -0.4617
0.3972 -0.4867 0.0301 -0.0283
0.3589 0.1682 -0.0620 0.1682
-0.0383 0.6549 -0.0921 0.1965
-0.2174 -0.1598 0.6339 0.0538
0.3589 0.1682 -0.0620 0.1682
I have been trying to replicate this in Python SAGE, but so far, I have had no success. My code looks like this:
st1= matrix([
[ 1, 0, 0, 0,-1,-1,-1, 0, 0, 0],
[ 0, 1, 0, 0, 1, 0, 0,-1,-1, 0],
[ 0, 0, 0, 0, 0, 1, 0, 1, 0,-1],
[ 0, 0, 0, 0, 0, 0, 1, 0, 0,-1],
[ 0, 0, 0,-1, 0, 0, 0, 0, 0, 1],
[ 0, 0,-1, 0, 0, 0, 0, 0, 1, 1]])
print st1
null2_or= transpose(st1).kernel()
null2_ra= transpose(st1).kernel().basis()
print "nullr2_or"
print null2_or
print "nullr2_ra"
print null2_ra
Note: The transpose was introduced after reading through some tutorials on this and has to do with the nature of SAGE automatically computing the kernel from the left (which in this case yields no result at all).
The problem with this now is: It DOES print me something... But not the right thing.
The output is as follows:
sage: load stochiometric.py
[ 1 0 0 0 -1 -1 -1 0 0 0]
[ 0 1 0 0 1 0 0 -1 -1 0]
[ 0 0 0 0 0 1 0 1 0 -1]
[ 0 0 0 0 0 0 1 0 0 -1]
[ 0 0 0 -1 0 0 0 0 0 1]
[ 0 0 -1 0 0 0 0 0 1 1]
nullr2_or
Free module of degree 10 and rank 4 over Integer Ring
Echelon basis matrix:
[ 1 0 0 1 0 0 1 1 -1 1]
[ 0 1 0 1 0 -1 1 2 -1 1]
[ 0 0 1 -1 0 1 -1 -2 2 -1]
[ 0 0 0 0 1 -1 0 1 0 0]
nullr2_ra
[
(1, 0, 0, 1, 0, 0, 1, 1, -1, 1),
(0, 1, 0, 1, 0, -1, 1, 2, -1, 1),
(0, 0, 1, -1, 0, 1, -1, -2, 2, -1),
(0, 0, 0, 0, 1, -1, 0, 1, 0, 0)
]
Upon closer inspection, you can see that the resulting kernel matrix (nullspace) looks similar, but is not the same.
Does anyone know what I need to do to get the same result as in Matlab and, if possible, how to obtain the orthonormal result (in Matlab called K_nuo)?
I have tried to look through the tutorials, documentation etc., but so far, no luck.
There might be a way to do this with SAGE builtin functions; I'm not sure.
However, if a numpy/python-based solution will do, then:
import numpy as np
def null(A, eps=1e-15):
    """
    Orthonormal basis for the null space of A, computed via SVD; see
    http://mail.scipy.org/pipermail/scipy-user/2005-June/004650.html
    """
    u, s, vh = np.linalg.svd(A)
    n = A.shape[1]  # the number of columns of A
    if len(s) < n:
        # pad the singular values with zeros for the missing dimensions
        expanded_s = np.zeros(n, dtype=s.dtype)
        expanded_s[:len(s)] = s
        s = expanded_s
    null_mask = (s <= eps)
    null_space = np.compress(null_mask, vh, axis=0)
    return np.transpose(null_space)
st1 = np.matrix([
[ 1, 0, 0, 0,-1,-1,-1, 0, 0, 0],
[ 0, 1, 0, 0, 1, 0, 0,-1,-1, 0],
[ 0, 0, 0, 0, 0, 1, 0, 1, 0,-1],
[ 0, 0, 0, 0, 0, 0, 1, 0, 0,-1],
[ 0, 0, 0,-1, 0, 0, 0, 0, 0, 1],
[ 0, 0,-1, 0, 0, 0, 0, 0, 1, 1]])
K = null(st1)
print(K)
yields the orthonormal null space:
[[ 0.59330559 0.13320203 0.30701044 -0.32180406]
[-0.09297005 0.04333798 0.20286425 0.71195719]
[ 0.14147329 0.00837169 0.5718718 0.22197807]
[ 0.35886225 0.16816832 -0.06199711 0.16817506]
[-0.16275558 0.45177747 0.33887617 -0.46165922]
[ 0.39719892 -0.48674377 0.03013138 -0.0283199 ]
[ 0.35886225 0.16816832 -0.06199711 0.16817506]
[-0.03833668 0.65491209 -0.09212849 0.19649496]
[-0.21738895 -0.15979664 0.63386891 0.05380301]
[ 0.35886225 0.16816832 -0.06199711 0.16817506]]
this confirms the columns have the null space property:
print(np.allclose(st1*K, 0))
# True
and this confirms that K is orthonormal:
print(np.allclose(K.T*K, np.eye(4)))
# True
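As an aside, recent SciPy versions (1.1 and later) expose the same SVD-based computation directly, so the helper above can be replaced by a one-liner:
from scipy.linalg import null_space
K = null_space(st1)  # columns form an orthonormal basis of the null space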
Something like this should work:
sage: st1= matrix([
[ 1, 0, 0, 0,-1,-1,-1, 0, 0, 0],
[ 0, 1, 0, 0, 1, 0, 0,-1,-1, 0],
[ 0, 0, 0, 0, 0, 1, 0, 1, 0,-1],
[ 0, 0, 0, 0, 0, 0, 1, 0, 0,-1],
[ 0, 0, 0,-1, 0, 0, 0, 0, 0, 1],
[ 0, 0,-1, 0, 0, 0, 0, 0, 1, 1]])
sage: K = st1.right_kernel(); K
Free module of degree 10 and rank 4 over Integer Ring
Echelon basis matrix:
[ 1 0 0 1 0 0 1 1 -1 1]
[ 0 1 0 1 0 -1 1 2 -1 1]
[ 0 0 1 -1 0 1 -1 -2 2 -1]
[ 0 0 0 0 1 -1 0 1 0 0]
sage: M = K.basis_matrix()
The gram_schmidt method gives a pair of matrices. Type M.gram_schmidt? to see the documentation.
sage: M.gram_schmidt() # rows are orthogonal, not orthonormal
(
[ 1 0 0 1 0 0 1 1 -1 1]
[ -1 1 0 0 0 -1 0 1 0 0]
[ 5/12 3/4 1 1/6 0 1/4 1/6 -1/12 5/6 1/6]
[ 12/31 -25/62 4/31 -9/62 1 -29/62 -9/62 10/31 17/62 -9/62],
[ 1 0 0 0]
[ 1 1 0 0]
[ -7/6 -3/4 1 0]
[ 1/6 1/2 -4/31 1]
)
sage: M.gram_schmidt()[0] # rows are orthogonal, not orthonormal
[ 1 0 0 1 0 0 1 1 -1 1]
[ -1 1 0 0 0 -1 0 1 0 0]
[ 5/12 3/4 1 1/6 0 1/4 1/6 -1/12 5/6 1/6]
[ 12/31 -25/62 4/31 -9/62 1 -29/62 -9/62 10/31 17/62 -9/62]
sage: M.change_ring(RDF).gram_schmidt()[0] # orthonormal
[ 0.408248290464 0.0 0.0 0.408248290464 0.0 0.0 0.408248290464 0.408248290464 -0.408248290464 0.408248290464]
[ -0.5 0.5 0.0 0.0 0.0 -0.5 0.0 0.5 0.0 0.0]
[ 0.259237923683 0.466628262629 0.622171016838 0.103695169473 0.0 0.15554275421 0.103695169473 -0.0518475847365 0.518475847365 0.103695169473]
[ 0.289303646409 -0.30135796501 0.0964345488031 -0.108488867403 0.747367753224 -0.349575239411 -0.108488867403 0.241086372008 0.204923416206 -0.108488867403]
The matrix st1 has integer entries, so Sage treats it as a matrix of integers, and tries to do as much as possible with integer arithmetic, and failing that, rational arithmetic. Because of this, Gram-Schmidt orthonormalization will fail, since it involves taking square roots. This is why the method change_ring(RDF) is there: RDF stands for Real Double Field. You could instead just change one entry of st1 from 1 to 1.0, and then it will treat st1 as a matrix over RDF from the start and you won't need to do this change_ring anywhere.
To expand on John's great answer, I think you just have two different bases for the same vector space. Note his use of right_kernel.
sage: st1= matrix([
....: [ 1, 0, 0, 0,-1,-1,-1, 0, 0, 0],
....: [ 0, 1, 0, 0, 1, 0, 0,-1,-1, 0],
....: [ 0, 0, 0, 0, 0, 1, 0, 1, 0,-1],
....: [ 0, 0, 0, 0, 0, 0, 1, 0, 0,-1],
....: [ 0, 0, 0,-1, 0, 0, 0, 0, 0, 1],
....: [ 0, 0,-1, 0, 0, 0, 0, 0, 1, 1]])
sage: st2 = matrix([[1,-1, 0, 2],
....: [-1, 1, 1, 0],
....: [ 0, 0, 1, 1],
....: [ 0, 0, 0, 1],
....: [ 1, 0, 0, 0],
....: [ 0,-1, 0, 1],
....: [ 0, 0, 0, 1],
....: [ 0, 1, 0, 0],
....: [ 0, 0, 1, 0],
....: [ 0, 0, 0, 1]])
sage: st2 = st2.transpose()
sage: st2
[ 1 -1 0 0 1 0 0 0 0 0]
[-1 1 0 0 0 -1 0 1 0 0]
[ 0 1 1 0 0 0 0 0 1 0]
[ 2 0 1 1 0 1 1 0 0 1]
sage: st1.right_kernel()
Free module of degree 10 and rank 4 over Integer Ring
Echelon basis matrix:
[ 1 0 0 1 0 0 1 1 -1 1]
[ 0 1 0 1 0 -1 1 2 -1 1]
[ 0 0 1 -1 0 1 -1 -2 2 -1]
[ 0 0 0 0 1 -1 0 1 0 0]
sage: st2.row_space()
Free module of degree 10 and rank 4 over Integer Ring
Echelon basis matrix:
[ 1 0 0 1 0 0 1 1 -1 1]
[ 0 1 0 1 0 -1 1 2 -1 1]
[ 0 0 1 -1 0 1 -1 -2 2 -1]
[ 0 0 0 0 1 -1 0 1 0 0]
Your spaces are the same, just different bases in Sage and Matlab.

numpy/pandas: How to convert a series of strings of zeros and ones into a matrix

I have data that arrives in this format:
[
(1, "000010101001010101011101010101110101", "aaa", ... ),
(0, "111101010100101010101110101010111010", "bb", ... ),
(0, "100010110100010101001010101011101010", "ccc", ... ),
(1, "000010101001010101011101010101110101", "ddd", ... ),
(1, "110100010101001010101011101010111101", "eeee", ... ),
...
]
In tuple format, it looks like this:
(Y, X, other_info, ... )
At the end of the day, I need to train a classifier (e.g. sklearn.linear_model.logistic.LogisticRegression) using Y and X.
What's the most straightforward way to turn the string of ones and zeros into something like a np.array, so that I can run it through the classifier? Seems like there should be an easy answer here, but I haven't been able to think of/google one.
A few notes:
I'm already using numpy/pandas/sklearn, so anything in those libraries is fair game.
For a lot of what I'm doing, it's convenient to have the other_info columns together in a DataFrame
The strings are pretty long (~20,000 columns), but the total data frame is not very tall (~500 rows).
Since you asked primarily for a way to convert a string of ones and zeros into a numpy array, I'll offer my solution as follows:
d = '0101010000' * 2000 # create a 20,000 long string of 1s and 0s
d_array = np.fromstring(d, 'int8') - 48 # 48 is ascii 0. ascii 1 is 49
This compares favourably to @DSM's solution in terms of speed:
In [21]: timeit numpy.fromstring(d, dtype='int8') - 48
10000 loops, best of 3: 35.8 us per loop
In [22]: timeit numpy.fromiter(d, dtype='int', count=20000)
100 loops, best of 3: 8.57 ms per loop
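One caveat on newer NumPy versions: np.fromstring is deprecated for text input, and np.frombuffer on the encoded bytes is the equivalent replacement (it should be just as fast):
d_array = np.frombuffer(d.encode('ascii'), dtype=np.int8) - 48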
How about something like this:
Make the dataframe:
In [82]: v = [
....: (1, "000010101001010101011101010101110101", "aaa"),
....: (0, "111101010100101010101110101010111010", "bb"),
....: (0, "100010110100010101001010101011101010", "ccc"),
....: (1, "000010101001010101011101010101110101", "ddd"),
....: (1, "110100010101001010101011101010111101", "eeee"),
....: ]
In [83]: df = pandas.DataFrame(v)
We can use fromiter or array to get an ndarray:
In [84]: d ="000010101001010101011101010101110101"
In [85]: np.fromiter(d, int) # better: np.fromiter(d, int, count=len(d))
Out[85]:
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])
In [86]: np.array(list(d), int)
Out[86]:
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])
There might be a slick vectorized way to do this, but I'd just apply the obvious per-entry function to the values and get on with my day:
In [87]: df[1]
Out[87]:
0 000010101001010101011101010101110101
1 111101010100101010101110101010111010
2 100010110100010101001010101011101010
3 000010101001010101011101010101110101
4 110100010101001010101011101010111101
Name: 1
In [88]: df[1] = df[1].apply(lambda x: np.fromiter(x, int)) # better with count=len(x)
In [89]: df
Out[89]:
0 1 2
0 1 [0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 aaa
1 0 [1 1 1 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 1 0 bb
2 0 [1 0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 ccc
3 1 [0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 ddd
4 1 [1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 eeee
In [90]: df[1][0]
Out[90]:
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])
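If every string has the same length, the whole column can also be converted in one shot rather than row by row; a sketch in the spirit of the fromstring answer above, assuming df[1] still holds the raw strings:
# join all strings into one byte buffer, then reshape to (n_rows, n_bits)
X = (np.frombuffer(''.join(df[1]).encode('ascii'), dtype=np.int8) - 48).reshape(len(df), -1)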
