I have the following code to find the columns in a data frame with the lowest number of distinct values and list them.
import pandas as pd
df = pd.DataFrame({"A": [1,2,3], "B": [2,3,4],"C":[1,1,2],"D":[3,3,4]})
print(df)
unique_counts = df.nunique()
lowest_distinct = 100
#
#Find the lowest distinct count across all columns
#
for column_name, distinct_count in unique_counts.iteritems():
    if distinct_count < lowest_distinct:
        lowest_distinct = distinct_count
lowest_distinct_columns = []
#
#Collect the columns having that count
#
for column_name, distinct_count in unique_counts.iteritems():
    if distinct_count == lowest_distinct:
        lowest_distinct_columns.append(column_name)
#
#Get the columns and values returned as a data frame
#
melted_df = df.melt(value_vars=lowest_distinct_columns,var_name='column', value_name='value')
print(melted_df)
It feels a bit clunky so I'm wondering if there is a better way to do it? Ultimately I'm trying to get a list of the columns and values that have the lowest number of distinct values.
Any thoughts or tips appreciated.
Cheers
David
Does this do what you want?
unique_counts = df.nunique()
lowest_distinct = unique_counts.min()
lowest_distinct_columns = unique_counts[unique_counts == lowest_distinct].index.tolist()
result = pd.DataFrame({col: df[col].unique() for col in lowest_distinct_columns})
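For reference, a hedged check of what that returns on the sample frame; note the dict-of-arrays constructor assumes every lowest-count column has the same number of unique values, otherwise it raises a ValueError:
import pandas as pd
df = pd.DataFrame({"A": [1,2,3], "B": [2,3,4], "C": [1,1,2], "D": [3,3,4]})
unique_counts = df.nunique()
lowest_distinct = unique_counts.min()
lowest_distinct_columns = unique_counts[unique_counts == lowest_distinct].index.tolist()
# Columns are the lowest-distinct columns, rows are their unique values
result = pd.DataFrame({col: df[col].unique() for col in lowest_distinct_columns})
print(result)
#    C  D
# 0  1  3
# 1  2  4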
Use (where unique_count = df.nunique()):
In [114]: df[unique_count[unique_count == unique_count.min()].index].melt(
     ...:     var_name='column', value_name='value')
Out[114]:
column value
0 C 1
1 C 1
2 C 2
3 D 3
4 D 3
5 D 4
For older versions of pandas (< v0.20), consider apply to return a Series:
unique_ser = df.apply(lambda col: col.nunique(), axis=0)
print(unique_ser)
# A 3
# B 3
# C 2
# D 2
lowest_unique_ser = unique_ser[unique_ser == unique_ser.min()]
print(lowest_unique_ser)
# C 2
# D 2
final_ser = df[lowest_unique_ser.index].apply(lambda col: col.unique().tolist(), axis=0)
print(final_ser)
# C (1, 2)
# D (3, 4)
Thank you for the responses. The three solutions to the first part of the problem work equally well, and the two responses to the second part also work very well.
I'll need to use them in practice to see if there is any material difference in performance or behaviour, but to summarise the complete solutions:
#Parfait's solution:
unique_ser = df.apply(lambda col: col.nunique(), axis=0)
print(unique_ser)
# A 3
# B 3
# C 2
# D 2
lowest_unique_ser = unique_ser[unique_ser == unique_ser.min()]
print(lowest_unique_ser)
# C 2
# D 2
final_ser = df[lowest_unique_ser.index].apply(lambda col: col.unique().tolist(), axis=0)
print(final_ser)
# C (1, 2)
# D (3, 4)
and #Priker's
unique_counts = df.nunique()
lowest_distinct = unique_counts.min()
lowest_distinct_columns = unique_counts[unique_counts == lowest_distinct].index.tolist()
result = pd.DataFrame({col: df[col].unique() for col in lowest_distinct_columns})
Use
df1 = pd.DataFrame({"A": [1,2,3], "B": [2,3,4],"C":[1,1,2],"D":[3,3,4]})
print(df1)
   A  B  C  D
0  1  2  1  3
1  2  3  1  3
2  3  4  2  4
unique_counts = df1.nunique()
unique_counts[unique_counts==unique_counts.min()]
C    2
D    2
dtype: int64
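If you also want the matching values back as a column/value frame, a hedged sketch building on the unique_counts above (it reuses the melt idea from the earlier answer):
cols = unique_counts[unique_counts == unique_counts.min()].index
print(df1[cols].melt(var_name='column', value_name='value'))
#   column  value
# 0      C      1
# 1      C      1
# 2      C      2
# 3      D      3
# 4      D      3
# 5      D      4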
Related
I have a dataframe as follows:
df=pandas.DataFrame()
df['A'] = numpy.random.random(10)
df['B'] = numpy.random.random(10)
df['C'] = numpy.random.random(10)
df['Col_name'] = numpy.random.choice(['A','B','C'],size=10)
I want to obtain an output that uses 'Col_name' and the respective index of the dataframe row to lookup the value in the dataframe.
I can get the desired output with .apply() as follows:
df['output'] = df.apply(lambda x: x[ x['Col_name'] ], axis=1)
However, .apply() is slow over a large dataframe since it iterates row by row. Is there an obvious solution in pandas that is faster/vectorised?
You can also pick each column name (or give a list of possible names), apply it as a mask to filter your dataframe, then pick values from the desired column and assign them to all rows matching the mask. Then repeat this for the next column.
for column_name in df:  # or: for column_name in ['A', 'B', 'C']
    df.loc[df['Col_name']==column_name, 'output'] = df[column_name]
Rows that do not match any mask will have NaN values.
PS. According to my test with 10,000,000 random rows, the .apply() method takes 2 min 24 s to finish while this method takes only 4.3 s.
Use melt to flatten your dataframe and keep rows where Col_name equals the variable column:
df['output'] = df.melt('Col_name', ignore_index=False).query('Col_name == variable')['value']
print(df)
# Output
A B C Col_name output
0 0.202197 0.430735 0.093551 B 0.430735
1 0.344753 0.979453 0.999160 C 0.999160
2 0.500904 0.778715 0.074786 A 0.500904
3 0.050951 0.317732 0.363027 B 0.317732
4 0.722624 0.026065 0.424639 C 0.424639
5 0.578185 0.626698 0.376692 C 0.376692
6 0.540849 0.805722 0.528886 A 0.540849
7 0.918618 0.869893 0.825991 C 0.825991
8 0.688967 0.203809 0.734467 B 0.203809
9 0.811571 0.010081 0.372657 B 0.010081
Transformation after melt:
>>> df.melt('Col_name', ignore_index=False)
Col_name variable value
0 B A 0.202197
1 C A 0.344753
2 A A 0.500904 # keep
3 B A 0.050951
4 C A 0.722624
5 C A 0.578185
6 A A 0.540849 # keep
7 C A 0.918618
8 B A 0.688967
9 B A 0.811571
0 B B 0.430735 # keep
1 C B 0.979453
2 A B 0.778715
3 B B 0.317732 # keep
4 C B 0.026065
5 C B 0.626698
6 A B 0.805722
7 C B 0.869893
8 B B 0.203809 # keep
9 B B 0.010081 # keep
0 B C 0.093551
1 C C 0.999160 # keep
2 A C 0.074786
3 B C 0.363027
4 C C 0.424639 # keep
5 C C 0.376692 # keep
6 A C 0.528886
7 C C 0.825991 # keep
8 B C 0.734467
9 B C 0.372657
Update
Alternative with set_index and stack for #Rabinzel:
df['output'] = (
df.set_index('Col_name', append=True).stack()
.loc[lambda x: x.index.get_level_values(1) == x.index.get_level_values(2)]
.droplevel([1, 2])
)
print(df)
# Output
A B C Col_name output
0 0.209953 0.332294 0.812476 C 0.812476
1 0.284225 0.566939 0.087084 A 0.284225
2 0.815874 0.185154 0.155454 A 0.815874
3 0.017548 0.733474 0.766972 A 0.017548
4 0.494323 0.433719 0.979399 C 0.979399
5 0.875071 0.789891 0.319870 B 0.789891
6 0.475554 0.229837 0.338032 B 0.229837
7 0.123904 0.397463 0.288614 C 0.288614
8 0.288249 0.631578 0.393521 A 0.288249
9 0.107245 0.006969 0.367748 C 0.367748
import pandas as pd
import numpy as np
df=pd.DataFrame()
df['A'] = np.random.random(10)
df['B'] = np.random.random(10)
df['C'] = np.random.random(10)
df['Col_name'] = np.random.choice(['A','B','C'],size=10)
df["output"] = np.nan
Even though you do not like going row by row, I still routinely use loops to go through each row just to know where it breaks when it breaks. Here are two loops just to satisfy myself. The output column is created ahead of time with NaN values because the loops need it to exist.
# each row by index
for i in range(len(df)):
    df.loc[i, 'output'] = df[df['Col_name'][i]][i]

# each row, but by column name: mask the rows whose Col_name matches, then assign from that column
for col in df['Col_name'].unique():
    df.loc[df['Col_name'] == col, 'output'] = df.loc[df['Col_name'] == col, col]
Here are some "non-loop" ways to do so.
df["output"] = df.lookup(df.index, df.Col_name)
df['output'] = np.where(df['output'].isna(), df.lookup(df.index, df.Col_name), df['output'])
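On newer pandas versions, where DataFrame.lookup has been removed (it was deprecated in 1.2), a minimal numpy-indexing sketch can stand in for it; this assumes the value columns are exactly A, B and C:
import numpy as np
import pandas as pd

df = pd.DataFrame()
df['A'] = np.random.random(10)
df['B'] = np.random.random(10)
df['C'] = np.random.random(10)
df['Col_name'] = np.random.choice(['A','B','C'], size=10)

value_cols = ['A', 'B', 'C']
# Position of each row's Col_name within the value columns, then fancy-index row by row
col_pos = pd.Index(value_cols).get_indexer(df['Col_name'])
df['output'] = df[value_cols].to_numpy()[np.arange(len(df)), col_pos]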
I have the following dataset
df = pd.DataFrame([[1,1000],[2,1000],[3,1000]])
df.columns = ["A","B"]
df
A B
0 1 1000
1 2 1000
2 3 1000
I would like to create a new column C that calculates:
if A = 1 then C = B*.8
if A = 2 then C = B*.1
if A = 3 then C = B*.05
if A = 4 then C = B*.025
...
...(going up to 10)
Is it best to create a function?
def calculation(x):
    if x == 1:
        return y*0.75
    elif ...
But I'm not quite sure how to work with multiple columns. Any help would be appreciated! Thanks
Use Series.map by dictionary and then multiple by B column:
d = {1:.8, 2:.1, 3:.05, 4:.025}
df['C'] = df['A'].map(d).mul(df.B)
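A minimal runnable sketch of that approach on the sample frame (the mapping dictionary would be extended up to 10 in practice):
import pandas as pd

df = pd.DataFrame([[1,1000],[2,1000],[3,1000]], columns=["A","B"])

# Map A to its multiplier, then multiply element-wise by B
d = {1:.8, 2:.1, 3:.05, 4:.025}
df['C'] = df['A'].map(d).mul(df.B)
print(df)
#    A     B      C
# 0  1  1000  800.0
# 1  2  1000  100.0
# 2  3  1000   50.0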
I have many DataFrames that I need to merge.
Let's say:
base: id constraint
1 'a'
2 'b'
3 'c'
df_1: id value constraint
1 1 'a'
2 2 'a'
3 3 'a'
df_2: id value constraint
1 1 'b'
2 2 'b'
3 3 'b'
df_3: id value constraint
1 1 'c'
2 2 'c'
3 3 'c'
If I try and merge all of them (it'll be in a loop), I get:
a = pd.merge(base, df_1, on=['id', 'constraint'], how='left')
b = pd.merge(a, df_2, on=['id', 'constraint'], how='left')
c = pd.merge(b, df_3, on=['id', 'constraint'], how='left')
id constraint value value_x value_y
1 'a' 1 NaN NaN
2 'b' NaN 2 NaN
3 'c' NaN NaN 3
The desired output would be:
id constraint value
1 'a' 1
2 'b' 2
3 'c' 3
I know about combine_first and it works, but I can't use this approach because it is thousands of times slower.
Is there a merge that can replace values when columns overlap?
It's somewhat similar to this question, with no answers.
Given your MCVE:
import pandas as pd
base = pd.DataFrame([1,2,3], columns=['id'])
df1 = pd.DataFrame([[1,1]], columns=['id', 'value'])
df2 = pd.DataFrame([[2,2]], columns=['id', 'value'])
df3 = pd.DataFrame([[3,3]], columns=['id', 'value'])
I would suggest concatenating your dataframes first (using a loop if needed):
df = pd.concat([df1, df2, df3])
And then merge:
pd.merge(base, df, on='id')
It yields:
id value
0 1 1
1 2 2
2 3 3
Update
Running the code with the new version of your question and the input provided by #Celius Stingher:
a = {'id':[1,2,3],'constrains':['a','b','c']}
b = {'id':[1,2,3],'value':[1,2,3],'constrains':['a','a','a']}
c = {'id':[1,2,3],'value':[1,2,3],'constrains':['b','b','b']}
d = {'id':[1,2,3],'value':[1,2,3],'constrains':['c','c','c']}
base = pd.DataFrame(a)
df1 = pd.DataFrame(b)
df2 = pd.DataFrame(c)
df3 = pd.DataFrame(d)
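Presumably the same concat-then-merge as above, now joining on both keys (a hedged sketch, using the constrains column name from the input):
df = pd.concat([df1, df2, df3])
result = pd.merge(base, df, on=['id', 'constrains'], how='left')
print(result)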
We get:
id constrains value
0 1 a 1
1 2 b 2
2 3 c 3
Which seems to be compliant with your expected output.
You can use ffill() for the purpose:
df_1 = pd.DataFrame({'val':[1]}, index=[1])
df_2 = pd.DataFrame({'val':[2]}, index=[2])
df_3 = pd.DataFrame({'val':[3]}, index=[3])
(pd.concat((df_1, df_2, df_3), axis=1)
   .ffill(axis=1)
   .iloc[:, -1]
)
Output:
1 1.0
2 2.0
3 3.0
Name: val, dtype: float64
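To see why that works, here is the intermediate concat as a hedged illustration: each row holds exactly one non-NaN value, so forward-filling across columns pushes it into the last column:
print(pd.concat((df_1, df_2, df_3), axis=1))
#    val  val  val
# 1  1.0  NaN  NaN
# 2  NaN  2.0  NaN
# 3  NaN  NaN  3.0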
For your new data:
base.merge(pd.concat((df1, df2, df3)),
           on=['id', 'constraint'],
           how='left')
output:
id constraint value
0 1 'a' 1
1 2 'b' 2
2 3 'c' 3
Conclusion: you are actually looking for the option how='left' in merge
If you only need to merge all dataframes with base:
Based on the edit:
import pandas as pd
a = {'id':[1,2,3],'constrains':['a','b','c']}
b = {'id':[1,2,3],'value':[1,2,3],'constrains':['a','a','a']}
c = {'id':[1,2,3],'value':[1,2,3],'constrains':['b','b','b']}
d = {'id':[1,2,3],'value':[1,2,3],'constrains':['c','c','c']}
base = pd.DataFrame(a)
df_1 = pd.DataFrame(b)
df_2 = pd.DataFrame(c)
df_3 = pd.DataFrame(d)
dataframes = [df_1,df_2,df_3]
for i in dataframes:
    base = base.merge(i, how='left', on=['id','constrains'])
summation = [col for col in base if col.startswith('value')]
base['value'] = base[summation].sum(axis=1)
base = base.dropna(how='any',axis=1)
print(base)
Output:
id constrains value
0 1 a 1.0
1 2 b 2.0
2 3 c 3.0
Those who simply want to do a merge, overriding the values (which is my case), can achieve that using this method, which is really similar to Celius Stingher's answer.
Documented version is on the original gist.
import pandas as pa

def rmerge(left, right, **kwargs):

    # Function to flatten lists from http://rosettacode.org/wiki/Flatten_a_list#Python
    def flatten(lst):
        return sum(([x] if not isinstance(x, list) else flatten(x) for x in lst), [])

    # Set default for removing overlapping columns in "left" to be true
    myargs = {'replace': 'left'}
    myargs.update(kwargs)

    # Remove the replace key from the argument dict to be sent to
    # pandas merge command
    kwargs = {k: v for k, v in myargs.items() if k != 'replace'}

    if myargs['replace'] is not None:
        # Generate a list of overlapping column names not associated with the join
        skipcols = set(flatten([v for k, v in myargs.items() if k in ['on', 'left_on', 'right_on']]))
        leftcols = set(left.columns)
        rightcols = set(right.columns)
        dropcols = list((leftcols & rightcols).difference(skipcols))

        # Remove the overlapping column names from the appropriate DataFrame
        if myargs['replace'].lower() == 'left':
            left = left.copy().drop(dropcols, axis=1)
        elif myargs['replace'].lower() == 'right':
            right = right.copy().drop(dropcols, axis=1)

    df = pa.merge(left, right, **kwargs)
    return df
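A hypothetical usage sketch on frames shaped like the question's (assuming base carries a pre-existing value column that should be overridden by the right-hand frame):
base = pa.DataFrame({'id': [1, 2, 3], 'constraint': ['a', 'b', 'c'], 'value': [None, None, None]})
df_1 = pa.DataFrame({'id': [1, 2, 3], 'value': [1, 2, 3], 'constraint': ['a', 'a', 'a']})

# replace='left' (the default) drops base's overlapping 'value' column before merging,
# so df_1's values win wherever the join keys match
merged = rmerge(base, df_1, on=['id', 'constraint'], how='left', replace='left')
print(merged)
#    id constraint  value
# 0   1          a    1.0
# 1   2          b    NaN
# 2   3          c    NaN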
I have a dataframe and want to eliminate duplicate rows, that have same values, but in different columns:
df = pd.DataFrame(columns=['a','b','c','d'], index=['1','2','3'])
df.loc['1'] = pd.Series({'a':'x','b':'y','c':'e','d':'f'})
df.loc['2'] = pd.Series({'a':'e','b':'f','c':'x','d':'y'})
df.loc['3'] = pd.Series({'a':'w','b':'v','c':'s','d':'t'})
df
Out[8]:
a b c d
1 x y e f
2 e f x y
3 w v s t
Rows [1] and [2] have the same values {x,y,e,f}, but arranged in a cross - i.e. if you exchanged columns c,d with a,b in row [2] you would have a duplicate.
I want to drop these lines and only keep one, to have the final output:
df_new
Out[20]:
a b c d
1 x y e f
3 w v s t
How can I efficiently achieve that?
I think you need to filter by boolean indexing, with a mask created by numpy.sort combined with duplicated; to invert it, use ~:
import numpy as np

df = df[~pd.DataFrame(np.sort(df, axis=1), index=df.index).duplicated()]
print (df)
a b c d
1 x y e f
3 w v s t
Detail:
print (np.sort(df, axis=1))
[['e' 'f' 'x' 'y']
['e' 'f' 'x' 'y']
['s' 't' 'v' 'w']]
print (pd.DataFrame(np.sort(df, axis=1), index=df.index))
0 1 2 3
1 e f x y
2 e f x y
3 s t v w
print (pd.DataFrame(np.sort(df, axis=1), index=df.index).duplicated())
1 False
2 True
3 False
dtype: bool
print (~pd.DataFrame(np.sort(df, axis=1), index=df.index).duplicated())
1 True
2 False
3 True
dtype: bool
Here's another solution, with a for loop:
data = df.to_numpy()  # .as_matrix() in very old pandas
new = []
for row in data:
    if not new:
        new.append(row)
    else:
        # keep the row only if none of its values already appear in a kept row
        if not any([c in nrow for nrow in new for c in row]):
            new.append(row)
new_df = pd.DataFrame(new, columns=df.columns)
Use sorting (np.sort) and then get the duplicates (.duplicated()) out of it.
Later, use those duplicates to drop (df.drop) the required index.
import pandas as pd
import numpy as np
df = pd.DataFrame(columns=['a','b','c','d'], index=['1','2','3'])
df.loc['1'] = pd.Series({'a':'x','b':'y','c':'e','d':'f'})
df.loc['2'] = pd.Series({'a':'e','b':'f','c':'x','d':'y'})
df.loc['3'] = pd.Series({'a':'w','b':'v','c':'s','d':'t'})
df_duplicated = pd.DataFrame(np.sort(df, axis=1), index=df.index).duplicated()
index_to_drop = [ind for ind in range(len(df_duplicated)) if df_duplicated.iloc[ind]]
df.drop(df.index[index_to_drop])
If I have data like
Col1
A
B
A
B
A
C
I need output like
Col_value Count
A 3
B 2
C 1
I need col_value and count to be the column names.
So I can access it like a['col_value']
Use value_counts:
df = pd.value_counts(df.Col1).to_frame().reset_index()
df
  index  Col1
0     A     3
1     B     2
2     C     1
then rename your columns if needed:
df.columns = ['Col_value','Count']
df
Col_value Count
0 A 3
1 B 2
2 C 1
Another solution is groupby with aggregating size:
df = (df.groupby('Col1')
        .size()
        .reset_index(name='Count')
        .rename(columns={'Col1':'Col_value'}))
print (df)
Col_value Count
0 A 3
1 B 2
2 C 1
Use pd.crosstab as another alternative:
import pandas as pd
help(pd.crosstab)
Help on function crosstab in module pandas.core.reshape.pivot:
crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False)
Example:
df_freq = pd.crosstab(df['Col1'], columns='count')
df_freq.head()
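A hedged sketch of tidying that result into the requested Col_value/Count layout (assuming the single-column frame from the question):
import pandas as pd

df = pd.DataFrame({'Col1': ['A', 'B', 'A', 'B', 'A', 'C']})
df_freq = pd.crosstab(index=df['Col1'], columns='count')
df_freq = df_freq.reset_index().rename(columns={'Col1': 'Col_value', 'count': 'Count'})
df_freq.columns.name = None  # drop the leftover 'col_0' axis name from crosstab
print(df_freq)
#   Col_value  Count
# 0         A      3
# 1         B      2
# 2         C      1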
def frequencyTable(alist):
    '''
    list -> chart
    Returns None. Side effect is printing two columns showing each number that
    is in the list, and then a column indicating how many times it was in the list.
    Example:
    >>> frequencyTable([1, 3, 3, 2])
    ITEM FREQUENCY
    1 1
    2 1
    3 2
    '''
    countdict = {}
    for item in alist:
        if item in countdict:
            countdict[item] = countdict[item] + 1
        else:
            countdict[item] = 1
    itemlist = list(countdict.keys())
    itemlist.sort()
    print("ITEM", "FREQUENCY")
    for item in itemlist:
        print(item, " ", countdict[item])
    return None
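For completeness, a quick usage sketch on the question's column (assuming it is held in a pandas Series named Col1):
import pandas as pd

col1 = pd.Series(['A', 'B', 'A', 'B', 'A', 'C'], name='Col1')
# The function expects a plain list, so convert the Series first
frequencyTable(col1.tolist())
# ITEM FREQUENCY
# A   3
# B   2
# C   1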