I have a pandas dataframe which has around 350 columns and 500000 rows initially:
import string
import numpy as np
import pandas as pd
import itertools
cols = list(string.ascii_lowercase) + [i+j for i,j in [*itertools.combinations(list(string.ascii_lowercase), 2)]]
df = pd.DataFrame({col: np.repeat([np.random.randint(2)], [500000]) for col in cols})
I need to add 3000 new columns to my dataframe (initialized to 0) where the value of each row depends on the values in the existing rows (I use a mask when testing for this):
for i, j, k in itertools.combinations(list(string.ascii_lowercase), 3):
    df[i+j+k] = 0
    df.loc[(df[i] > 0) & (df[j] > 0) & (df[k] > 0) & (df[i+j] + df[i+k] + df[j+k] >= 2), i+j+k] = 1
However, the above loop is extremely slow. Is there a way to optimize this procedure, perhaps with a faster pandas lookup function?
Here is a much faster solution. When you get stuck on performance problems of this type, try switching to NumPy; it is way faster.
Make sure the result fits in your memory before you run it :D
import string
import numpy as np
import pandas as pd
import itertools
cols = list(string.ascii_lowercase) + [i+j for i,j in [*itertools.combinations(list(string.ascii_lowercase), 2)]]
df = pd.DataFrame({col: np.repeat([np.random.randint(2)], [500000]) for col in cols})
mat = df.values # Convert to numpy, much faster
additional_cols = []
for i, j, k in itertools.combinations(list(string.ascii_lowercase), 3):
    cond_1 = (mat[:, cols.index(i)] * mat[:, cols.index(j)] * mat[:, cols.index(k)]) > 0  # singles
    cond_2 = (mat[:, cols.index(i+j)] + mat[:, cols.index(i+k)] + mat[:, cols.index(j+k)]) >= 2  # pairs (>= 2, as in the original loop)
    col = (cond_1 & cond_2) + 0  # +0 converts the boolean mask to int
    additional_cols.append((i+j+k, col))
df_additional = pd.DataFrame(dict(additional_cols))  # make sure this fits in memory
df = pd.concat([df, df_additional], axis=1)
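As a small follow-up (not part of the answer above): cols.index() is a linear scan of the column list on every lookup, so precomputing a name-to-position dict shaves off some additional time, and storing the new columns as int8 instead of the default int64 keeps memory down (roughly 2600 x 500,000 int64 values is on the order of 10 GB). A minimal sketch, assuming df, cols and mat are built exactly as above:
idx = {name: pos for pos, name in enumerate(cols)}  # one-time name -> column-position map
new_cols = {}
for i, j, k in itertools.combinations(string.ascii_lowercase, 3):
    singles = (mat[:, idx[i]] * mat[:, idx[j]] * mat[:, idx[k]]) > 0
    pairs = (mat[:, idx[i+j]] + mat[:, idx[i+k]] + mat[:, idx[j+k]]) >= 2
    new_cols[i+j+k] = (singles & pairs).astype(np.int8)  # int8: 1 byte per cell instead of 8
df = pd.concat([df, pd.DataFrame(new_cols, index=df.index)], axis=1)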
Say I have the following DataFrame:
df = pd.DataFrame(np.arange(10).reshape(5,2),columns=list('AB'))
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
And I wish to output each column header followed by the column concatenated as a string like so:
'''A
02468
B
13579'''
I can do like so with a for loop:
for col in df.columns:
    print(col, df[col].astype(str).str.cat(), sep='\n')
but I have a large number of columns - is there a more efficient way to do this?
Try converting the columns to str with astype and joining them, then take advantage of to_csv's ability to create formatted output: set the separator to newlines and exclude the header. The column name ends up in the index position of each row, so it lands on its own line above the joined values:
import numpy as np
import pandas as pd
df = pd.DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
s = df.astype(str).apply(''.join).to_csv(sep='\n', header=False)
print(s)
s:
A
02468
B
13579
I was interested in the timings so I made a perfplot:
import numpy as np
import pandas as pd
import perfplot
def make_data(n):
    if n // 2 == 0:
        return pd.DataFrame(columns=list('AB'))
    df = pd.DataFrame(np.arange(n).reshape(n // 2, 2), columns=list('AB'))
    return df

def for_option(df):
    s = ''
    for k, v in df.astype(str).to_dict('list').items():
        s += f"{k}\n{''.join(v)}\n"
    return s

def apply_option_to_csv(df):
    s = df.astype(str).apply(''.join).to_csv(sep='\n', header=False)
    return s

def apply_option_for(df):
    s = ''
    for k, v in zip(df.columns, df.astype(str).apply(''.join)):
        s += f"{k}\n{v}\n"
    return s

if __name__ == '__main__':
    out = perfplot.bench(
        setup=make_data,
        kernels=[
            for_option,
            apply_option_to_csv,
            apply_option_for
        ],
        labels=['for option', 'apply option (to csv)', 'apply option (for)'],
        n_range=[2 ** k for k in range(25)],
        equality_check=None
    )
    out.save('res.png', transparent=False)
It appears to_csv has some overhead which makes it overall less efficient than the other options. apply(''.join) and iterating over to_dict('list').items() behave similarly at larger sizes, but Scott Boston's to_dict solution is significantly faster for smaller frames.
Try this:
for k, v in df.astype(str).to_dict('list').items():
    print(k)
    print(''.join(v))
It may be faster than using df.apply; you'll have to test it with your dataframe.
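For completeness, a quick way to test this on your own data is to time both variants with timeit; a minimal sketch (the frame below is just a placeholder, substitute your own):
import timeit
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(10_000).reshape(-1, 2), columns=list('AB'))  # placeholder frame

def with_to_dict(df):
    return ''.join(f"{k}\n{''.join(v)}\n" for k, v in df.astype(str).to_dict('list').items())

def with_apply(df):
    return ''.join(f"{k}\n{v}\n" for k, v in zip(df.columns, df.astype(str).apply(''.join)))

print('to_dict:', timeit.timeit(lambda: with_to_dict(df), number=100))
print('apply  :', timeit.timeit(lambda: with_apply(df), number=100))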
I have a very large DataFrame where one column (COL) contains a range (i.e. a list) of values. I want to turn this COL into individual columns, one per possible number, containing a 1 if that number is in COL and 0 otherwise.
Below is my current approach. However, it is slow for a high number of OBSERVATIONS and a large MAX_VALUE.
import pandas as pd
import numpy as np
OBSERVATIONS = 100000 # number of values 600000
MAX_VALUE = 400 # 400
_ = pd.DataFrame({
    'a': np.random.randint(2, 20, OBSERVATIONS),
    'b': np.random.randint(30, MAX_VALUE, OBSERVATIONS)
})
_['res'] = _.apply(lambda x: range(x['a'],x['b']),axis=1)
for i in range(MAX_VALUE):
    _[f'{i}'] = _['res'].apply(lambda x: 1 if i in x else 0)
You can try doing the calculations in NumPy and then inserting the NumPy array into the dataframe. This is about 5 times faster:
import pandas as pd
import numpy as np
import time
OBSERVATIONS = 100_000 # number of values 600000
MAX_VALUE = 400 # 400
_ = pd.DataFrame({
    'a': np.random.randint(2, 20, OBSERVATIONS),
    'b': np.random.randint(30, MAX_VALUE, OBSERVATIONS)
})
_['res'] = _.apply(lambda x: range(x['a'],x['b']),axis=1)
res1 = _.copy()
start = time.time()
for i in range(MAX_VALUE):
    res1[f'{i}'] = res1['res'].apply(lambda x: 1 if i in x else 0)
print(f'original: {time.time() - start}')
start = time.time()
z = np.zeros((len(_), MAX_VALUE), dtype=np.int64)
for i, r in enumerate(_.res):
    z[i, range(r.start, r.stop)] = 1
res2 = pd.concat([_, pd.DataFrame(z)], axis=1)
res2.columns = list(map(str, res2.columns))
print(f'new : {time.time() - start}')
assert res1.equals(res2)
Output:
original: 23.649751663208008
new : 4.586429595947266
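If the res column always holds a plain range(a, b) built from the a and b columns, as in the example above, the Python loop can be dropped entirely with a broadcast comparison. A hedged sketch (same _ and MAX_VALUE as above, not benchmarked here):
a = _['a'].to_numpy()
b = _['b'].to_numpy()
positions = np.arange(MAX_VALUE)  # candidate values 0 .. MAX_VALUE-1
z = ((positions >= a[:, None]) & (positions < b[:, None])).astype(np.int64)
res3 = pd.concat([_, pd.DataFrame(z, columns=[str(i) for i in range(MAX_VALUE)])], axis=1)
This produces the same 0/1 columns because i in range(a, b) is equivalent to a <= i < b.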
I need to find a more efficient solution for the following problem:
Given a dataframe with 4 variables in each row, I need to find the list of 8 elements that covers all 4 variables in the maximum number of rows.
A working, but very slow, solution is to create a second dataframe containing all possible combinations (basically permutations without repetition), loop through every combination, and compare it with the initial dataframe. The number of matching rows is counted and stored in the second dataframe.
import numpy as np
import pandas as pd
from itertools import combinations
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
df = 'x' + df.astype(str)
listofvalues = df['A'].tolist()
listofvalues.extend(df['B'].tolist())
listofvalues.extend(df['C'].tolist())
listofvalues.extend(df['D'].tolist())
listofvalues = list(dict.fromkeys(listofvalues))
possiblecombinations = list(combinations(listofvalues, 6))
dfcombi = pd.DataFrame(possiblecombinations, columns = ['M','N','O','P','Q','R'])
dfcombi['List'] = dfcombi.M.map(str) + ',' + dfcombi.N.map(str) + ',' + dfcombi.O.map(str) + ',' + dfcombi.P.map(str) + ',' + dfcombi.Q.map(str) + ',' + dfcombi.R.map(str)
dfcombi['Count'] = ''
for x, row in dfcombi.iterrows():
    comparelist = row['List'].split(',')
    pointercounter = df.index[df['A'].isin(comparelist) & df['B'].isin(comparelist) & df['C'].isin(comparelist) & df['D'].isin(comparelist)].tolist()
    dfcombi.at[x, 'Count'] = len(pointercounter)  # write back to the frame; assigning to the iterrows copy would be lost
I assume there must be a way to avoid the for loop and replace it with some vectorized lookup, I just cannot figure out how.
Thanks!
Your code can be rewritten as:
# working with integers is much faster than working with strings;
# factorize returns (integer codes, unique values)
enums, codes = df.stack().factorize()
# row-wise sets of the encoded values of df
s = [set(x) for x in enums.reshape(-1, 4)]
# possible combinations
from itertools import combinations, product
possiblecombinations = np.array([set(x) for x in combinations(range(len(codes)), 6)])
# count, for each combination, how many rows it covers, using issubset
ret = [0] * len(possiblecombinations)
for a, (i, b) in product(s, enumerate(possiblecombinations)):
    ret[i] += a.issubset(b)
# the combination with maximum count
max_combination = possiblecombinations[np.argmax(ret)]
# in code {0, 3, 4, 5, 17, 18}
# and in values:
codes[list(max_combination)]
# Index(['x5', 'x15', 'x12', 'x8', 'x0', 'x6'], dtype='object')
All of that took about 2 seconds, as opposed to your code, which took around 1.5 minutes.
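If you also want the rows covered by the winning combination (not shown above), a small follow-up sketch reusing the names from the code above:
best = possiblecombinations[np.argmax(ret)]   # winning combination, as a set of encoded values
covered_rows = [idx for idx, row_set in enumerate(s) if row_set.issubset(best)]
print(len(covered_rows))                      # should equal max(ret)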
I have two matrices, one of size (CxK) and another of size (SxK) (where S, C, and K can all be very large). I want to combine them into an output matrix of size (CxS) using the cosine similarity function. When I run my code it takes a very long time to produce an output, and I was wondering if there is any way to optimize what I currently have. [Note: the two input matrices are often very sparse.]
I was previously traversing each matrix with two for index, row loops, but I have since switched to while loops, which improved my run time significantly.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

A  # this is one of my input matrices (pandas dataframe)
B  # this is my second input matrix (pandas dataframe)
C = pd.DataFrame(columns=['col_1', 'col_2', 'col_3'])
i = 0
k = 0
while i < 5:
    col_1 = A.iloc[i].get('label_A')
    while k < 5:
        col_2 = B.iloc[k].get('label_B')
        propensity = cosine_similarity([A.drop('label_A', axis=1).iloc[i]],
                                       [B.drop('label_B', axis=1).iloc[k]])
        d = {'col_1': [col_1], 'col_2': [col_2], 'col_3': [propensity[0][0]]}
        to_append = pd.DataFrame(data=d)
        C = C.append(to_append)
        k += 1
    k = 0
    i += 1
Right now I have restricted the loops to only 5 items from each matrix, producing a 5x5 matrix, but I would obviously like this to work for very large inputs. This is the first time I have done anything like this, so please let me know if any facet of the code can be improved (data types used to hold the matrices, how to traverse them, updating the output matrix, etc.).
Thank you in advance.
This can be done much more easily and much faster by passing the whole arrays to cosine_similarity after you move the labels to the index:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time
c = 50
s = 50
k = 100
A = pd.DataFrame( np.random.rand(c,k))
B = pd.DataFrame( np.random.rand(s,k))
A['label_A'] = [f'A{i}' for i in range(c)]
B['label_B'] = [f'B{i}' for i in range(s)]
C = pd.DataFrame()
# your program
start = time.time()
i = 0
k = 0
while i < c:
    col_1 = A.iloc[i].get('label_A')
    while k < s:
        col_2 = B.iloc[k].get('label_B')
        propensity = cosine_similarity([A.drop('label_A', axis=1).iloc[i]],
                                       [B.drop('label_B', axis=1).iloc[k]])
        d = {'col_1': [col_1], 'col_2': [col_2], 'col_3': [propensity[0][0]]}
        to_append = pd.DataFrame(data=d)
        C = C.append(to_append)
        k += 1
    k = 0
    i += 1
print(f'elementwise: {time.time() - start:7.3f} s')
# my solution
start = time.time()
A = A.set_index('label_A')
B = B.set_index('label_B')
C1 = pd.DataFrame(cosine_similarity(A, B), index=A.index, columns=B.index).stack().rename('col_3')
C1.index.rename(['col_1','col_2'], inplace=True)
C1 = C1.reset_index()
print(f'whole array: {time.time() - start:7.3f} s')
# verification
assert (C[['col_1', 'col_2']].to_numpy() == C1[['col_1', 'col_2']].to_numpy()).all() \
    and np.allclose(C.col_3.to_numpy(), C1.col_3.to_numpy())
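Since you mention that the input matrices are often very sparse: cosine_similarity also accepts SciPy sparse matrices, so converting to a sparse format first can save memory and time when most entries are zero. A minimal sketch with made-up sizes and density (not taken from your data):
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

c, s, k = 1000, 1000, 5000                      # made-up sizes
A_sp = sparse.random(c, k, density=0.01, format='csr', random_state=0)
B_sp = sparse.random(s, k, density=0.01, format='csr', random_state=1)

sim = cosine_similarity(A_sp, B_sp)             # returns a dense (c, s) ndarray
print(sim.shape)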
I have 2-dimensional data (columns: Cell1, Cell2, ...; rows: Gene1, Gene2, ...) in which I want to delete rows with 99% zeroes, and from the resulting matrix drop columns with 99% zeroes. I have written the following code to do this, but since the matrix is very large it takes a long time to run. Is there a better way to approach this?
import pandas as pd
import numpy as np
def read_in(matrix_file):
    matrix_df = pd.read_csv(matrix_file, index_col=0)
    return matrix_df

def genes_less_exp(matrix_df):
    num_columns = matrix_df.shape[1]
    for index, row in matrix_df.iterrows():
        zero_els = np.count_nonzero(row.values == 0)
        gene_per_zero = (float(zero_els) / float(num_columns)) * 100
        if gene_per_zero >= 99:
            matrix_df.drop([index], axis=0, inplace=True)
    return matrix_df

def cells_less_exp(matrix_df):
    num_rows = matrix_df.shape[0]
    for label, content in matrix_df.iteritems():
        zero_els = np.count_nonzero(content.values == 0)
        cells_per_zero = (float(zero_els) / float(num_rows)) * 100
        if cells_per_zero >= 99:
            matrix_df.drop(label, axis=1, inplace=True)
    return matrix_df

if __name__ == "__main__":
    matrix_df = read_in("Data/big-matrix.csv")
    print("original:" + str(matrix_df.shape))
    filtered_genes = genes_less_exp(matrix_df)
    print("filtered_genes:" + str(filtered_genes.shape))
    filtered_cells = cells_less_exp(filtered_genes)
    print("filtered_cells:" + str(filtered_cells.shape))
    filtered_cells.to_csv("abi.99.percent.filtered.csv", sep=',')
It's easier if you reframe your question to "keep rows and columns with less than 99% zeros".
def drop_almost_zero(df, percentage):
    row_cut_off = int(percentage / 100 * len(df.columns))
    df = df[(df == 0).sum(axis='columns') <= row_cut_off]

    column_cut_off = int(percentage / 100 * len(df))
    b = (df == 0).sum(axis='rows')
    df = df[b[b <= column_cut_off].index.values]

    return df

# test
size = 50
percentage = 90
rows = size // 2
columns = size

a = np.random.choice(2, size=(rows, columns), p=[(1 - 0.1), 0.1])
df = pd.DataFrame(a, columns=[f'c{i}' for i in range(size)])

df = drop_almost_zero(df, percentage)

assert (df == 0).sum(axis='rows').max() <= percentage / 100 * rows
assert (df == 0).sum(axis='columns').max() <= percentage / 100 * columns
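To plug this into your original script, something along these lines should work (file names taken from your question; note that the cut-off here keeps rows and columns with at most roughly 99% zeros, which matches your filters up to the boundary case):
matrix_df = pd.read_csv("Data/big-matrix.csv", index_col=0)
print("original:" + str(matrix_df.shape))

filtered = drop_almost_zero(matrix_df, 99)   # rows filtered first, then columns, as in your code
print("filtered:" + str(filtered.shape))

filtered.to_csv("abi.99.percent.filtered.csv", sep=',')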