I have a very large DataFrame where one column (COL) includes a range (i.e. a list) of values. I want to turn COL into individual columns, each labeled with a specific number and containing 1 if that number is in COL, else 0.
Below is my current approach. However, it is slow for a high number of OBSERVATIONS and a large MAX_VALUE.
import pandas as pd
import numpy as np
OBSERVATIONS = 100000 # number of values 600000
MAX_VALUE = 400 # 400
_ = pd.DataFrame({
    'a': np.random.randint(2, 20, OBSERVATIONS),
    'b': np.random.randint(30, MAX_VALUE, OBSERVATIONS)
})
_['res'] = _.apply(lambda x: range(x['a'], x['b']), axis=1)
for i in range(MAX_VALUE):
    _[f'{i}'] = _['res'].apply(lambda x: 1 if i in x else 0)
You can try doing the calculations in NumPy and then inserting the NumPy array into the DataFrame. This is about 5 times faster:
import pandas as pd
import numpy as np
import time
OBSERVATIONS = 100_000 # number of values 600000
MAX_VALUE = 400 # 400
_ = pd.DataFrame({
    'a': np.random.randint(2, 20, OBSERVATIONS),
    'b': np.random.randint(30, MAX_VALUE, OBSERVATIONS)
})
_['res'] = _.apply(lambda x: range(x['a'],x['b']),axis=1)
res1 = _.copy()
start = time.time()
for i in range(MAX_VALUE):
    res1[f'{i}'] = res1['res'].apply(lambda x: 1 if i in x else 0)
print(f'original: {time.time() - start}')
start = time.time()
z = np.zeros((len(_), MAX_VALUE), dtype=np.int64)
for i, r in enumerate(_.res):
    z[i, range(r.start, r.stop)] = 1
res2 = pd.concat([_, pd.DataFrame(z)], axis=1)
res2.columns = list(map(str, res2.columns))
print(f'new : {time.time() - start}')
assert res1.equals(res2)
Output:
original: 23.649751663208008
new : 4.586429595947266
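If you want to push this further, here is a fully vectorized sketch using NumPy broadcasting (my addition, not benchmarked above; it assumes the same a/b columns and the same half-open ranges as the question):
values = np.arange(MAX_VALUE)                        # shape (MAX_VALUE,)
a = _['a'].to_numpy()[:, None]                       # shape (n_rows, 1)
b = _['b'].to_numpy()[:, None]                       # shape (n_rows, 1)
z = ((values >= a) & (values < b)).astype(np.int64)  # 1 where a <= value < b
res3 = pd.concat([_, pd.DataFrame(z, columns=[str(i) for i in range(MAX_VALUE)])], axis=1)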
I have a DataFrame with 3 million rows (df1) and another with 10k rows (df2). What is the fastest method of filtering df1 for each row in df2?
Here is exactly what I need to do in the loop:
for i in list(range(len(df2))):  # for each row
    x = df1[(df1['column1'].isin([df2['info1'][i]])) \
            & (df1['column2'].isin([df2['info2'][i]])) \
            & (df1['column3'].isin([df2['info3'][i]]))]
    # ..... more code using x variable every time ......
This code is not fast enough to be viable.
Note that I used the .isin function, but there is always only one item inside it. I found that using .isin(), i.e. df1['column1'].isin([df2['info1'][i]]), was faster than using df1['column1'] == df2['info1'][i].
import pandas as pd
import numpy as np
def make_filter(x, y, match_dict, unique=False):
    filter = None
    for x_key in x.columns:
        if x_key in match_dict:
            y_key = match_dict[x_key]
            y_col = y[y_key]
            if unique:
                y_col = y_col.unique()
            col_filter = x[x_key].isin(y_col)
            if filter is None:
                filter = col_filter
            else:
                filter = filter & col_filter
    return filter

def main():
    n_rows = 100
    x = np.random.randint(4, size=(n_rows, 2))
    x = pd.DataFrame(x, columns=["col1", "col2"])
    y = np.random.randint(2, 4, size=(n_rows, 2))
    y = pd.DataFrame(y, columns=["info1", "info2"])
    match_dict = {"col1": "info1", "col2": "info2"}
    z = make_filter(x, y, match_dict, unique=True)
    print(x[z])

main()
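Another hedged sketch for the question above (my addition, not from either post): since every df2 row is an exact match on the same three columns, you can index df1 by those columns once and look each df2 row up instead of re-scanning df1 on every iteration. The column names are the ones assumed in the question.
# build the lookup index once
df1_indexed = df1.set_index(['column1', 'column2', 'column3']).sort_index()

for i in range(len(df2)):
    key = (df2['info1'][i], df2['info2'][i], df2['info3'][i])
    if key in df1_indexed.index:
        x = df1_indexed.loc[[key]]  # all df1 rows matching this df2 row
        # the three key columns now live in the index; use x.reset_index() if you need them back
        # ..... more code using x every time ......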
I have 2-dimensional data (columns: Cell1, Cell2, ...; rows: Gene1, Gene2, ...) in which I want to delete rows with 99% zeroes and then, from the resulting matrix, drop columns with 99% zeroes. I have written the following code to do this, but since the matrix is very large it takes a long time to run. Is there a better way to approach this issue?
import pandas as pd
import numpy as np
def read_in(matrix_file):
    matrix_df = pd.read_csv(matrix_file, index_col=0)
    return matrix_df

def genes_less_exp(matrix_df):
    num_columns = matrix_df.shape[1]
    for index, row in matrix_df.iterrows():
        zero_els = np.count_nonzero(row.values == 0)
        gene_per_zero = (float(zero_els) / float(num_columns)) * 100
        if gene_per_zero >= 99:
            matrix_df.drop([index], axis=0, inplace=True)
    return matrix_df

def cells_less_exp(matrix_df):
    num_rows = matrix_df.shape[0]
    for label, content in matrix_df.items():
        zero_els = np.count_nonzero(content.values == 0)
        cells_per_zero = (float(zero_els) / float(num_rows)) * 100
        if cells_per_zero >= 99:
            matrix_df.drop(label, axis=1, inplace=True)
    return matrix_df
if __name__ == "__main__":
    matrix_df = read_in("Data/big-matrix.csv")
    print("original:" + str(matrix_df.shape))
    filtered_genes = genes_less_exp(matrix_df)
    print("filtered_genes:" + str(filtered_genes.shape))
    filtered_cells = cells_less_exp(filtered_genes)
    print("filtered_cells:" + str(filtered_cells.shape))
    filtered_cells.to_csv("abi.99.percent.filtered.csv", sep=',')
It's easier if you reframe your question as "keep those with less than 99% zeros".
def drop_almost_zero(df, percentage):
    row_cut_off = int(percentage / 100 * len(df.columns))
    df = df[(df == 0).sum(axis='columns') <= row_cut_off]

    column_cut_off = int(percentage / 100 * len(df))
    b = (df == 0).sum(axis='rows')
    df = df[b[b <= column_cut_off].index.values]

    return df
#test
size = 50
percentage = 90
rows = size//2
columns = size
a = np.random.choice(2, size=(rows, columns), p=[(1-0.1), 0.1])
df = pd.DataFrame(a, columns=[f'c{i}' for i in range(size)])
df = drop_almost_zero(df,percentage)
assert (df == 0).sum(axis='rows').max() <= percentage/100*rows
assert (df == 0).sum(axis='columns').max() <= percentage/100*columns
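Applied to the files from the original question, usage might look like this (a sketch reusing the same file names):
matrix_df = pd.read_csv("Data/big-matrix.csv", index_col=0)
filtered = drop_almost_zero(matrix_df, 99)
filtered.to_csv("abi.99.percent.filtered.csv")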
Hello there, I would like to iterate over the column 'CPB%' and add the computed values to a related column called 'Proba'. What I tried so far looks like this:
bins = np.linspace(0, 1, num=100)
dCPB = df['CPB%']
df['binnedB'] = pd.cut(dCPB, bins)
dfnew = pd.DataFrame(pd.cut(df['CPB%'], bins=bins).value_counts()).sort_index(ascending = True)
dfnew['binned'] = dfnew.index
total = dfnew['CPB%'].sum()
idx = total
for index, row in dfnew.iterrows():
    idx = idx - row['CPB%']
    row['Proba'] = float(idx) / float(total)
But my iteration does not update my empty column 'Proba'. Any idea why? Thanks!
I think the problem is that you are assigning the result back to the row, which doesn't get stored anywhere. Instead you can do:
proba = []
for index, row in dfnew.iterrows():
    idx = idx - row['CPB%']
    proba.append(float(idx) / float(total))
dfnew['Proba'] = proba
However, this is not the best way; you can use .apply with axis=1 to do row-wise calculations on a DataFrame.
You can use pd.Series.cumsum to perform your iterative deductions:
total = dfnew['CPB%'].sum()
dfnew['Proba'] = 1 - dfnew['CPB%'].cumsum() / total
With Pandas you should look to vectorise algorithms, which usually involves column-wise operations as opposed to a row-wise for loop. Here's a complete demonstration:
df = pd.DataFrame({'A': list(range(1, 7))})
def jpp(df):
    total = df['A'].sum()
    df['Proba'] = 1 - df['A'].cumsum() / total
    return df

def yolo(df):
    total = df['A'].sum()
    idx = total
    proba = []
    for index, row in df.iterrows():
        idx = idx - row['A']
        proba.append(float(idx) / float(total))
    df['Proba'] = proba
    return df
# check results are the same
assert df.pipe(jpp).equals(df.pipe(yolo))
%timeit df.pipe(jpp) # 691 µs
%timeit df.pipe(yolo) # 840 µs
I need to split a DataFrame into 10 parts, use one part as the test set, and merge the remaining 9 to use as the training set. I have come up with the following code, where I am able to split the dataset and am trying to merge the remaining sets after picking one of the 10.
The first iteration goes fine, but I get an error in the second iteration.
df = pd.DataFrame(np.random.randn(10, 4), index=list(range(10)))
for x in range(3):
    dfList = np.array_split(df, 3)
    testdf = dfList[x]
    dfList.remove(dfList[x])
    print(testdf)
    traindf = pd.concat(dfList)
    print(traindf)
    print("================================================")
I don't think you have to split the DataFrame into 10 parts, just into 2.
I use this code for splitting a DataFrame into a training set and a validation set:
test_index = np.random.choice(df.index, int(len(df.index)/10), replace=False)
test_df = df.loc[test_index]
train_df = df.loc[~df.index.isin(test_index)]
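An equivalent sketch using DataFrame.sample (my addition; it assumes the index values are unique):
test_df = df.sample(frac=0.1)
train_df = df.drop(test_df.index)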
Okay, I got it working this way:
df = pd.DataFrame(np.random.randn(10, 4), index=list(range(10)))
dfList = np.array_split(df, 3)
for x in range(3):
    trainList = []
    for y in range(3):
        if y == x:
            testdf = dfList[y]
        else:
            trainList.append(dfList[y])
    traindf = pd.concat(trainList)
    print(testdf)
    print(traindf)
    print("================================================")
But a better approach is welcome.
You can use the permutation function from numpy.random:
import numpy as np
import pandas as pd
import math as mt

l = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
df = pd.DataFrame({'a': l, 'b': l})

# shuffle the dataframe index
shuffled_idx = np.random.permutation(df.index)

# divide the shuffled index into N equal(ish) parts
# for this example, let N = 4
N = 4
n = len(shuffled_idx) / N

parts = []
for j in range(N):
    parts.append(shuffled_idx[mt.ceil(j*n): mt.ceil(j*n+n)])

# to show each shuffled part of the data frame
for k in parts:
    print(df.iloc[k])
I wrote a script for splitting a Pandas DataFrame randomly; you can find/fork it on GitHub. See also the Pandas documentation on merge, join, and concatenate.
Same code for your reference:
import pandas as pd
import numpy as np
from xlwings import Sheet, Range, Workbook
#path to file
df = pd.read_excel(r"//PATH TO FILE//")
df.columns = [c.replace(' ', "_") for c in df.columns]
x = df.columns[0]

# number of parts the data frame or the list needs to be split into
n = 7

seq = list(df[x])
np.random.shuffle(seq)
lists1 = [seq[i:i+n] for i in range(0, len(seq), n)]
listsdf = pd.DataFrame(lists1).reset_index()

dataframesDict = dict()

# calling xlwings workbook function
Workbook()

for i in range(0, n):
    if Sheet.count() < n:
        Sheet.add()
    dataframesDict[i] = df.loc[df.Column_Name.isin(list(listsdf[listsdf.columns[i+1]]))]
    Range(i, "A1").value = dataframesDict[i]
Looks like you are trying to do a k-fold type thing rather than a one-off split. This code should help. You may also find that scikit-learn's k-fold functionality works in your case; that's also worth checking out (see the short KFold sketch after the example below).
# Split dataframe by rows into n roughly equal portions and return a list of them.
def splitDf(df, n):
    splitPoints = list(map(lambda x: int(x * len(df) / n), list(range(1, n))))
    splits = list(np.split(df.sample(frac=1), splitPoints))
    return splits

# Take splits from splitDf, and return a test set (splits[index]) and training set (the rest).
def makeTrainAndTest(splits, index):
    # index is zero based, so range 0-9 for a 10-fold split
    test = splits[index]
    leftLst = splits[:index]
    rightLst = splits[index+1:]
    train = pd.concat(leftLst + rightLst)
    return train, test
You can then use these functions to make the folds:
df = <my_total_data>
n = 10
splits = splitDf(df, n)
trainTest = []
for i in range(0, n):
    trainTest.append(makeTrainAndTest(splits, i))

# Get test set 2
test2 = trainTest[2][1]
# Get training set zero
train0 = trainTest[0][0]
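As mentioned above, scikit-learn can also do the splitting for you. A minimal KFold sketch (my addition; it assumes df is the full dataset):
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=0)
for train_idx, test_idx in kf.split(df):
    train, test = df.iloc[train_idx], df.iloc[test_idx]
    # ... fit and evaluate on this fold ...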
I have two pandas DataFrames A and B, with columns ['start', 'end', 'value'] but not the same number of rows. I'd like to set the values for each row in A as follows:
A.iloc(i) = B['value'][B['start'] < A[i,'start'] & B['end'] > A[i,'end']]
Multiple rows of B may satisfy this condition for a given i; in that case, the max (or sum) of the corresponding rows should be the result. If none satisfies it, the value of A.iloc[i] should either not be updated or be set to a default value of 0 (either way is fine).
I'm interested in finding the most efficient way of doing this.
import numpy as np
import pandas as pd
np.random.seed(1)
lenB = 10
lenA = 20
B_start = np.random.rand(lenB)
B_end = B_start + np.random.rand(lenB)
B_value = np.random.randint(100, 200, lenB)
A_start = np.random.rand(lenA)
A_end = A_start + np.random.rand(lenA)
#if you use dataframe
#B_start = B["start"].values
#B_end = ...
mask = (A_start[:, None ] > B_start) & (A_end[:, None] < B_end)
r, c = np.where(mask)
result = pd.Series(B_value[c]).groupby(r).max()
print(result)
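If you then want one value per row of A, with 0 where no row of B matched, you can reindex the grouped result (my addition, building on the code above):
A_value = result.reindex(range(lenA), fill_value=0).to_numpy()
print(A_value)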