Say I have the following DataFrame:
df = pd.DataFrame(np.arange(10).reshape(5,2),columns=list('AB'))
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
And I wish to output each column header followed by the column concatenated as a string like so:
'''A
02468
B
13579'''
I can do like so with a for loop:
for col in df.columns:
print(col, df[col].astype(str).str.cat(), sep='\n')
but I have a large number of columns - is there a more efficient way to do this?
Try converting the columns to str with astype, joining them together, then take advantage to to_csv's ability to create formatted data setting the separator to newlines, and exclude the header:
import numpy as np
import pandas as pd
df = pd.DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
s = df.astype(str).apply(''.join).to_csv(sep='\n', header=False)
print(s)
s:
A
02468
B
13579
I was interested in the timings so I made a perfplot:
import numpy as np
import pandas as pd
import perfplot
def make_data(n):
if n // 2 == 0:
return pd.DataFrame(columns=list('AB'))
df = pd.DataFrame(np.arange(n).reshape(n // 2, 2), columns=list('AB'))
return df
def for_option(df):
s = ''
for k, v in df.astype(str).to_dict('list').items():
s += f"{k}\n{''.join(v)}\n"
return s
def apply_option_to_csv(df):
s = df.astype(str).apply(''.join).to_csv(sep='\n', header=False)
return s
def apply_option_for(df):
s = ''
for k, v in zip(df.columns, df.astype(str).apply(''.join)):
s += f"{k}\n{v}\n"
return s
if __name__ == '__main__':
out = perfplot.bench(
setup=make_data,
kernels=[
for_option,
apply_option_to_csv,
apply_option_for
],
labels=['for option', 'apply option (to csv)', 'apply option (for)'],
n_range=[2 ** k for k in range(25)],
equality_check=None
)
out.save('res.png', transparent=False)
It appears to_csv has some overhead which makes it overall less efficient than other options. In terms of apply(''.join) vs to_dict('list').items() and joining each value they appear to behave similarly at larger values, but Scott Boston's solution is significantly faster for smaller frames.
Try this:
for k,v in df.astype(str).to_dict('list').items():
print(k)
print(''.join(v))
It may be faster than using df.apply you'll have to test with your dataframe.
Related
I have .csv file with colums A and B, and rows with values.
A,
B
232.65,
-57.48
22.69,
-5.46
23.67,
-7.71
I want to loop these values with function.
So lets create simple function with 2 positional parameters:
def add_numbers(n1, n2):
answer = n1+n2
print(answer)
#lets read the file:
import pandas as pd
df= pd.read_csv(r'myfile.csv')
print(df[A]) #just checking the column A for this example, but column B is also required for n2.
0 232.65
1 22.69
3 23.67
I could also transfer those colums to array but I still could not loop it. How would I loop these with the function that requires two arguments?
arr = np.loadtxt(r'myfile.csv', delimiter=',')
arr
array([[232.65, -57.48],
[22.69, -5.46],
[23.67, -7.71],
I have been trying to loop with various ways and iter and enumerate and apply, but I keep doing something slightly wrong.
Cheer, Joonatan
You can loop and pass the values in each row to add_numbers. Then use iterrows to get the index and row values.
def add_numbers(n1, n2):
answer = n1 + n2
print(answer)
import pandas as pd
df = pd.read_csv(r'myfile.csv')
for index, row in df.iterrows():
add_numbers(row['A'], row['B'])
I hope, it works for your solution. Whenever you have to apply a custom function and want to loop through values to perform an operation use apply function.
Code:
import pandas as pd
df = pd.read_csv('./loop_through_2_columns.csv')
def add_numbers(n1, n2):
answer = n1+n2
return answer
df['result'] = df.apply(lambda x: add_numbers(x.A, x.B), axis=1)
df.head()
Output:
A B result
0 232.65 -57.48 175.17
1 22.69 -5.46 17.23
2 23.67 -7.71 15.96
try:
import pandas as pd
df = pd.DataFrame({
'A': [232.65,22.69,23.67],
'B':[-57.48,-5.46,-7.71]
})
def add_numbers(n1, n2):
answer = n1+n2
print(answer)
for i in range(len(df)):
n1 = df.loc[i,'A']
n2 = df.loc[i,'B']
add_numbers(n1,n2)
output:
175.17000000000002
17.23
15.96
I have a pandas dataframe which has around 350 columns and 500000 rows initially:
import string
import numpy as np
import pandas as pd
import itertools
cols = list(string.ascii_lowercase) + [i+j for i,j in [*itertools.combinations(list(string.ascii_lowercase), 2)]]
df = pd.DataFrame({col: np.repeat([np.random.randint(2)], [500000]) for col in cols})
I need to add 3000 new columns to my dataframe (initialized to 0) where the value of each row depends on the values in the existing rows (I use a mask when testing for this):
for i, j, k in itertools.combinations(list(string.ascii_lowercase), 3):
df[i+j+k] = 0
df.loc[(df[i] > 0) & (df[j] > 0) & (df[k] > 0) & (df[i + j] + df[i + k] + df[j + k] >= 2), i+j+k] = 1
However, the issue is that the above loop is extremely slow! Is there a way to optimize the procedure above? Perhaps using a pandas lookup-function that is faster?
Here you have a much faster solution. When you get stuck in performance problems of this type, try switching to Numpy. It is way faster!
Assure it fits in your memory before you run it :D
import string
import numpy as np
import pandas as pd
import itertools
cols = list(string.ascii_lowercase) + [i+j for i,j in [*itertools.combinations(list(string.ascii_lowercase), 2)]]
df = pd.DataFrame({col: np.repeat([np.random.randint(2)], [500000]) for col in cols})
mat = df.values # Convert to numpy, much faster
additional_cols = []
for i, j, k in itertools.combinations(list(string.ascii_lowercase), 3):
cond_1 = (mat[:,cols.index(i)]*mat[:,cols.index(j)]*mat[:,cols.index(k)])>0 # Singles
cond_2 = (mat[:,cols.index(i+j)]+mat[:,cols.index(i+k)] + mat[:,cols.index(j+k)])>2 # Multiples
col = (cond_1 & cond_2) + 0 # +0 transforms to int
additional_cols.append((i+j+k, col))
df_additional = pd.DataFrame(dict(additional_cols)) # Assure it fits in memory
df = pd.concat([df, df_additional], axis=1) # Assure it fits in memory
I'm having a problem with an old function computing the concentration of pandas categorical columns. There seems to have been a change making it impossible to subset the result of the .value_counts() method of a categorical series.
Minimal non-working example:
import pandas as pd
import numpy as np
df = pd.DataFrame({"A":["a","b","c","a"]})
def get_concentration(df,cat):
tmp = df[cat].astype("category")
counts = tmp.value_counts()
obs = len(tmp)
all_cons = []
for key in counts.keys():
single = np.square(np.divide(float(counts[key]),float(obs)))
all_cons.append(single)
return np.sum(all_cons)
get_concentration(df, "A")
This results in a key error for counts["a"]. I'm quite sure this worked in a past version of pandas and the documentation doesn't seem to mention a change regarding the .value_counts() method.
Let's agree on methodology:
>>> df.A.value_counts()
a 2
b 1
c 1
obs = len((df['A'].astype('category'))
>>> obs
4
The concentration should be as follows (per the Herfindahl Index):
>>> (2 / 4.) ** 2 + (1 / 4.) ** 2 + (1 / 4.) ** 2
0.375
Which is equivalent to (Pandas 0.17+):
>>> ((df.A.value_counts() / df.A.count()) ** 2).sum()
0.375
If you really want a function:
def concentration(df, col):
return ((df[col].value_counts() / df[col].count()) ** 2).sum()
>>> concentration(df, 'A')
0.375
Since you're iterating in a loop (and not working vectorically), you might as well just explicitly iterate over pairs. It simplifies the syntax, IMHO:
import pandas as pd
import numpy as np
df = pd.DataFrame({"A":["a","b","c","a"]})
def get_concentration(df,cat):
tmp = df[cat].astype("category")
counts = tmp.value_counts()
obs = len(tmp)
all_cons = []
# See change in following line - you're anyway iterating
# over key-value pairs; why not do so explicitly?
for k, v in counts.to_dict().items():
single = np.square(np.divide(float(v),float(obs)))
all_cons.append(single)
return np.sum(all_cons)
>>> get_concentration(df, "A")
0.25
To fix the current function, you just need to access the index values using .ix (see below). You might be better of using a vectorized function - I've addend one at the end.
df = pd.DataFrame({"A":["a","b","c","a"]})
tmp = df[cat].astype('category')
counts = tmp.value_counts()
obs = len(tmp)
all_cons = []
for key in counts.index:
single = np.square(np.divide(float(counts.ix[key]), float(obs)))
all_cons.append(single)
return np.sum(all_cons)
yields:
get_concentration(df, "A")
0.25
You might want to try a vectorized version, which also doesn't necessarily need the category dtype, such as:
def get_concentration(df, cat):
counts = df[cat].value_counts()
return counts.div(len(counts)).pow(2).sum()
I have a bunch of data (10M + records) that breaks down to an identifier, a location and a date. I want to find the number of times that any identifier moved from some locationA to some other locationB over the entire set of dates. Any identifier may not have a location for all possible dates. When an identifier does not have a location recorded, that should be treated as an actual 'unknown' location for that date.
Here is some reproducible fake data...
import numpy as np
import pandas as pd
import datetime
base = datetime.date.today()
num_days = 50
dates = np.array([base - datetime.timedelta(days=x) for x in range(num_days-1, -1, -1)])
ids = np.arange(50)
mi = pd.MultiIndex.from_product([ids, dates])
locations = np.array([chr(x) for x in 97 + np.random.randint(26, size=len(mi))])
s = pd.Series(locations, index=mi)
mask = np.random.rand(len(mi)) > .5
s[mask] = np.nan
s = s.dropna()
My initial thought was to create a dataframe and use boolean masking/vectorized operations to solve this
df = s.unstack(0).fillna('unknown')
Apparently my data is sparse enough to cause a MemoryError (from all the extra entries resulting from unstacking).
My current working solution is the following
def series_fn(s):
s = s.reindex(pd.date_range(s.index.levels[1].min(), s.index.levels[1].max()), level=-1).fillna('unknown')
mask_prev = (s != s.shift(-1))[:-1]
mask_next = (s != s.shift())[1:]
s_prev = s[:-1][mask_prev]
s_next = s[1:][mask_next]
s_tup = pd.Series(list(zip(s_prev, s_next)))
return s_tup.value_counts()
result_per_id = s.groupby(level=0).apply(series_fn)
result = result_per_id.sum(level=-1)
result looks like
(a, b) 1
(a, c) 5
(a, e) 3
(a, f) 3
(a, g) 3
(a, h) 3
(a, i) 1
(a, j) 1
(a, k) 2
(a, l) 2
...
This is going to take ~5 hours for all my data. Does anyone know any faster ways of doing this?
Thanks!
Hmmm, I guess I should have transposed the data... well that was a relatively simple fix. Instead of using groupby and apply,
s = s.reorder_levels(['date', 'id'])
s = s.sortlevel(0)
results = []
for i in range(len(s.index.levels[0])-1):
t = time.time()
s0 = s.loc[s.index.levels[0][i]]
s1 = s.loc[s.index.levels[0][i+1]]
df = pd.concat((s0, s1), axis=1)
# Note: this is slower than the line above
# df = s.loc[s.index.levels[0][0:2], :].unstack(0)
df = df.fillna('unknown')
mi = pd.MultiIndex.from_arrays((df.iloc[:, 0], df.iloc[:, 1]))
s2 = pd.Series(1, mi)
res = s2.groupby(level=[0, 1]).apply(np.sum)
results.append(res)
print(time.time() - t)
results = pd.concat(results, axis=1)
Still unclear on why the commented out section takes about three times as long as the three lines above it.
I have a two dimensional (or more) pandas DataFrame like this:
>>> import pandas as pd
>>> df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
>>> df
A B
0 0 1
1 2 3
2 4 5
Now suppose I have a numpy array like np.array([2,3]) and want to check if there is any row in df that matches with the contents of my array. Here the answer should obviously true but eg. np.array([1,2]) should return false as there is no row with both 1 in column A and 2 in column B.
Sure this is easy but don't see it right now.
Turns out it is really easy, the following does the job here:
>>> ((df['A'] == 2) & (df['B'] == 3)).any()
True
>>> ((df['A'] == 1) & (df['B'] == 2)).any()
False
Maybe somebody comes up with a better solution which allows directly passing in the array and the list of columns to match.
Note that the parenthesis around df['A'] == 2 are not optional since the & operator binds just as strong as the == operator.
an easier way is:
a = np.array([2,3])
(df == a).all(1).any()
If you also want to return the index where the matches occurred:
index_list = df[(df['A'] == 2)&(df['B'] == 3)].index.tolist()
To find rows where a single column equals a certain value:
df[df['column name'] == value]
To find rows where multiple columns equal different values, Note the inner ():
df[(df["Col1"] == Value1 & df["Col2"] == Value2 & ....)]
a simple solution with dictionary
def check_existance(dict_of_values, df):
v = df.iloc[:, 0] == df.iloc[:, 0]
for key, value in dict_of_values.items():
v &= (df[key] == value)
return v.any()
import pandas as pd
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
this_row_exists = {'A':2, 'B':3}
check_existance(this_row_exists, df)
# True
this_row_does_not_exist = {'A':2, 'B':5}
check_existance(this_row_does_not_exist, df)
# False
An answer that works with larger dataframes so you don't need to manually check for each columns:
import pandas as pd
import numpy as np
#define variables
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
a = np.array([2,3])
def check_if_np_array_is_in_df(df, a):
# transform a into a dataframe
da = pd.DataFrame(np.expand_dims(a,axis=0), columns=['A','B'])
# drop duplicates from df
ddf=df.drop_duplicates()
result = pd.concat([ddf,da]).shape[0] - pd.concat([ddf,da]).drop_duplicates().shape[0]
return result
print(check_if_np_array_is_in_df(df, a))
print(check_if_np_array_is_in_df(df, [1,3]))
If you want to return the row where the matches occurred:
resulting_row = df[(df['A'] == 2)&(df['B'] == 3)].values