pandas loop to run multiple cross tabs - python

I have the following dataset and I want to compute all possible combinations of cross-tabulations in the most efficient way. I have been able to calculate pairs against one master variable, but not for all possibilities (I have listed what I mean below). Is there a way to get this all in a loop that could handle any number of columns? Thanks so much!
data
import pandas as pd
df1 = pd.DataFrame(data={'id': [1,2,3,4,5,6,7,8,9,10],
                         'a': [1,1,2,2,2,1,1,2,1,1],
                         'b': [1,2,3,3,3,2,1,2,3,1],
                         'c': [1,2,2,1,1,1,1,2,1,2],
                         'd': [1,1,2,2,1,1,1,1,1,2],
                         })
d1={1: 'right', 2: 'left'}
d2={1: '10', 2: '30', 3: '20'}
d3={1: 'green', 2: 'red'}
d4={1: 'yes', 2: 'no'}
df1['a']=df1['a'].map(d1).fillna('Other')
df1['b']=df1['b'].map(d2).fillna('Other')
df1['c']=df1['c'].map(d3).fillna('Other')
df1['d']=df1['d'].map(d4).fillna('Other')
combinations
pd.crosstab(df1.a, df1.b)
pd.crosstab(df1.a, df1.c)
pd.crosstab(df1.a, df1.d)
pd.crosstab(df1.b, df1.c)
pd.crosstab(df1.b, df1.d)
pd.crosstab(df1.c, df1.d)
pd.crosstab(df1.a, [df1.b, df1.c])
pd.crosstab(df1.a, [df1.b, df1.d])
pd.crosstab(df1.a, [df1.c, df1.d])
pd.crosstab(df1.a, [df1.b, df1.c, df1.d])
what I have so far
def cross_tab(data_frame, id_col):
    col_names = ['b', 'c', 'd']
    datasets = {}
    for i in col_names:
        datasets['crosstab_{}'.format(i)] = pd.crosstab(data_frame[id_col], data_frame[i])
    return datasets
cross_tab(df1, 'a')
EDIT
A slightly edited request, now separate from the cross-tabulation itself: split the output based on whether a table contains a specific value. In this case, the result for column a (which contains the value 100) should be stored in a separate list from the rest (b and c).
data
import pandas as pd
import numpy as np
df1 = pd.DataFrame(data={
    'a': [1,1,1,1],
    'b': [1,1,2,1],
    'c': [1,2,2,1]
})
d1={0: 'right', 1: 'left'}
d2={1: 'yes', 2: 'no'}
d3={1: 'up', 2: 'down', 3: 'sideways'}
#d4={1: 'yes', 2: 'no'}
df1['a']=df1['a'].map(d1).fillna('Other')
df1['b']=df1['b'].map(d2).fillna('Other')
df1['c']=df1['c'].map(d3).fillna('Other')
command solved (I think)
from collections import defaultdict

def split_cross_tabs(dataframe, cols, contain_val):
    datasets = defaultdict(dict)
    for x in dataframe[cols]:
        p = dataframe[x].value_counts(normalize=True) * 100
        # file the table under 'y' if it contains contain_val anywhere, else 'n'
        datasets['y' if p.eq(contain_val).any() else 'n']['crosstab_{}'.format(x)] = p
    return datasets
output
defaultdict(dict,
{'y': {'crosstab_a': left 100.0
Name: a, dtype: float64},
'n': {'crosstab_b': yes 75.0
no 25.0
Name: b, dtype: float64,
'crosstab_c': down 50.0
up 50.0
Name: c, dtype: float64}})

Try the itertools recipe for a powerset, modified to keep only combinations of length 2 or greater:
from itertools import chain, combinations
def all_cross_tabs(dataframe, cols):
    datasets = {}
    for s in chain.from_iterable(
            combinations(cols, r) for r in range(2, len(cols) + 1)
    ):
        datasets[f'crosstab_{"_".join(s)}'] = pd.crosstab(
            dataframe[s[0]],
            [dataframe[c] for c in s[1:]]
        )
    return datasets
Sample:
d = all_cross_tabs(df1, ['a', 'b', 'c', 'd'])
d.keys():
dict_keys(['crosstab_a_b', 'crosstab_a_c', 'crosstab_a_d', 'crosstab_b_c',
'crosstab_b_d', 'crosstab_c_d', 'crosstab_a_b_c', 'crosstab_a_b_d',
'crosstab_a_c_d', 'crosstab_b_c_d', 'crosstab_a_b_c_d'])
d['crosstab_a_b']:
b 10 20 30
a
left 0 3 1
right 3 1 2
d['crosstab_a_b_c']:
b 10 20 30
c green red green red green red
a
left 0 0 2 1 0 1
right 2 1 1 0 1 1
d['crosstab_a_b_c_d']:
b 10 20 30
c green red green red green red
d yes no no yes no yes yes
a
left 0 0 1 1 1 0 1
right 2 1 0 1 0 1 1
Edit: Split into two sections based on contain_val
from collections import defaultdict

def split_cross_tabs(dataframe, cols, contain_val):
    datasets = defaultdict(dict)
    for s in chain.from_iterable(
            combinations(cols, r) for r in range(2, len(cols) + 1)
    ):
        ct_df = pd.crosstab(
            dataframe[s[0]],
            [dataframe[c] for c in s[1:]]
        )
        datasets[
            'y' if ct_df.eq(contain_val).any().any() else 'n'
        ][f'crosstab_{"_".join(s)}'] = ct_df
    return datasets
d = split_cross_tabs(df1, ['a', 'b', 'c', 'd'], 3)
d.keys():
dict_keys(['y', 'n'])
list(map(lambda a: a.keys(), d.values())):
[dict_keys(['crosstab_a_b', 'crosstab_b_c', 'crosstab_b_d']),
dict_keys(['crosstab_a_c', 'crosstab_a_d', 'crosstab_c_d', 'crosstab_a_b_c',
'crosstab_a_b_d', 'crosstab_a_c_d', 'crosstab_b_c_d',
'crosstab_a_b_c_d'])]
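If the split should be driven by percentages rather than raw counts (closer to the edited question, where the check was against a value of 100), pd.crosstab accepts a normalize argument; a minimal sketch of the change inside split_cross_tabs:
ct_df = pd.crosstab(
    dataframe[s[0]],
    [dataframe[c] for c in s[1:]],
    normalize='index'
) * 100
With normalize='index' each row sums to 100 after the multiplication, so contain_val can be compared against percentages instead of counts.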

Related

Pandas - modifying single/multiple columns with method chaining

I discovered method chaining in pandas only very recently. I love how it makes the code cleaner and more readable, but I still can't figure out how to use it when I want to modify only a single column, or a group of columns, as part of the pipeline.
For example, let's say this is my DataFrame:
df = pd.DataFrame({
    'num_1': [np.nan, 2., 2., 3., 1.],
    'num_2': [9., 6., np.nan, 5., 7.],
    'str_1': ['a', 'b', 'c', 'a', 'd'],
    'str_2': ['C', 'B', 'B', 'D', 'A'],
})
And I have some manipulation I want to do on it:
numeric_cols = ['num_1', 'num_2']
str_cols = ['str_1', 'str_2']
df[numeric_cols] = df[numeric_cols].fillna(0.).astype('int')
df[numeric_cols] = df[numeric_cols] * 2
df['str_2'] = df['str_2'].str.lower()
df[str_cols] = df[str_cols].replace({'a': 'z', 'b':'y', 'c': 'x'})
My question is - what is the most pandas-y way / best practice to achieve all of the above with method chaining?
I went through the documentation of .assign and .pipe, and many answers here, and have gotten as far as this:
def foo_numbers(df):
    numeric_cols = ['num_1', 'num_2']
    df[numeric_cols] = df[numeric_cols].fillna(0.).astype('int')
    df[numeric_cols] = df[numeric_cols] * 2
    return df

to_rep = {'a': 'z', 'b': 'y', 'c': 'x'}  # the same replacement mapping as above

df = (df
      .pipe(foo_numbers)
      .assign(str_2=df['str_2'].str.lower())
      .replace({'str_1': to_rep, 'str_2': to_rep})
      )
which produces the same output. My problems with this are:
The pipe seems to just hide the handling of the numeric columns from the main chain, but the implementation inside hasn't improved at all.
The .replace requires me to manually name all the columns one by one. What if I have more than just two columns? (You can assume I want to apply the same replacement to all columns).
The .assign is OK, but I was hoping there is a way to pass str.lower as a callable to be applied to that one column, but I couldn't make it work.
So what's the correct way to approach this kind of change to a DataFrame, using method chaining?
I would do it this way, with the help of DataFrame.select_dtypes and pandas.concat:
import numpy as np
df = pd.concat(
    [df.select_dtypes(np.number)
       .fillna(0)
       .astype(int)
       .mul(2),
     df.select_dtypes('object')
       .apply(lambda s: s.str.lower())
       .replace({'a': 'z', 'b': 'y', 'c': 'x'})],
    axis=1)
Output :
print(df)
num_1 num_2 str_1 str_2
0 0 18 z x
1 4 12 y y
2 4 0 x y
3 6 10 z d
4 2 14 d z
You already have a good approach, except for the fact that you mutate the input. Either make a copy, or chain operations:
def foo_numbers(df):
    df = df.copy()
    numeric_cols = ['num_1', 'num_2']
    df[numeric_cols] = df[numeric_cols].fillna(0, downcast='infer').mul(2)
    return df
Or:
def foo_numbers(df):
    numeric_cols = ['num_1', 'num_2']
    return (df[numeric_cols].fillna(0, downcast='infer').mul(2)
            .combine_first(df)
            )[df.columns]
Here are some more examples.
Using assign:
numeric_cols = ['num_1', 'num_2']
str_cols = ['str_1', 'str_2']
# c=c binds the current column name in each lambda; without the default
# argument every lambda would capture the last value of c from the loop
(df.assign(**{c: lambda d, c=c: d[c].fillna(0, downcast='infer').mul(2)
              for c in numeric_cols})
   .assign(**{c: lambda d, c=c: d[c].str.lower().replace({'a': 'z', 'b': 'y', 'c': 'x'})
              for c in str_cols})
)
Using apply:
def foo(s):
    if pd.api.types.is_numeric_dtype(s):
        return s.fillna(0, downcast='infer').mul(2)
    elif s.dtype == object:
        return s.str.lower().replace({'a': 'z', 'b': 'y', 'c': 'x'})
    return s

df.apply(foo)
Using pipe:
def foo(df):
    df = df.copy()
    df.update(df.select_dtypes('number')
                .fillna(0, downcast='infer').mul(2))
    df.update(df.select_dtypes(object)
                .apply(lambda s: s.str.lower().replace({'a': 'z', 'b': 'y', 'c': 'x'}))
              )
    return df

df.pipe(foo)
One option, with the method chaining:
(df
 .loc(axis=1)[numeric_cols]
 .fillna(0, downcast='infer')
 .mul(2)
 .assign(**df.loc(axis=1)[str_cols]
            .transform(lambda f: f.str.lower())
            .replace({'a': 'z', 'b': 'y', 'c': 'x'}))
)
num_1 num_2 str_1 str_2
0 0 18 z x
1 4 12 y y
2 4 0 x y
3 6 10 z d
4 2 14 d z
Another option, using pyjanitor's transform_columns:
(df.transform_columns(numeric_cols,
                      lambda f: f.fillna(0, downcast='infer').mul(2),
                      elementwise=False)
   .transform_columns(str_cols, str.lower)
   .replace({'a': 'z', 'b': 'y', 'c': 'x'})
)
num_1 num_2 str_1 str_2
0 0 18 z x
1 4 12 y y
2 4 0 x y
3 6 10 z d
4 2 14 d z

For and If together - dataframe Python

If the value in column 'y' is K, multiply the corresponding value in column 'x' by 1e3. If column 'y' is M, multiply it by 1e6. The code below multiplies all the values by 1e3:
value_list = []
for i in list(result['x'].values):
    # np.where(...) returns an array, which is always truthy,
    # so this first branch runs for every row
    if np.where(result['y'] == 'K'):
        value_list.append(float(i) * 1e3)
    elif np.where(result['y'] == 'M'):
        value_list.append(float(i) * 1e6)
    else:
        value_list.append(np.nan)
df['Value_numeric'] = value_list
df.head().Value_numeric
(The dataframe and the current output were shown as images in the original post.)
This case is simple enough that it's not necessary to use a loop or a custom function; one can use a simple assignment:
import pandas as pd
import numpy as np
d = {'x': [750, 5, 4, 240, 220], 'y': ['K', 'M', 'M', 'K', 'K']}
df = pd.DataFrame(data=d)
# here is the main operation:
df['value_numeric'] = np.where(df['y']=='K', df['x'] * 1e3, df['x'] * 1e6)
print(df)
output
x y value_numeric
0 750 K 750000.0
1 5 M 5000000.0
2 4 M 4000000.0
3 240 K 240000.0
4 220 K 220000.0
You can do something like this:
df = pd.DataFrame([[1, "a"], [2, 'b'], [3, 'c']], columns=['A', 'B'])

def calc(x):
    if x['B'] == 'a':
        return x['A'] * 10
    if x['B'] == 'b':
        return x['A'] * 20
    if x['B'] == 'c':
        return x['A'] * 30

df['calculate'] = df.apply(lambda x: calc(x), axis=1)
print(df)
print(df)
# A B calculate
#0 1 a 10
#1 2 b 40
#2 3 c 90
You can adjust your calculations as needed based on the condition.
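If there are more suffixes than just K and M, np.select keeps the operation vectorized without a row-wise apply. A minimal sketch, reusing the d/df sample from the first answer and falling back to NaN for anything else (as the original loop intended):
import numpy as np
import pandas as pd

d = {'x': [750, 5, 4, 240, 220], 'y': ['K', 'M', 'M', 'K', 'K']}
df = pd.DataFrame(data=d)

# one condition/choice pair per suffix; default covers everything else
conditions = [df['y'] == 'K', df['y'] == 'M']
choices = [df['x'] * 1e3, df['x'] * 1e6]
df['value_numeric'] = np.select(conditions, choices, default=np.nan)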

Calculate average of column x if column y meets criteria, for each y

How do I retrieve the value of column Z and its average if any value is > 1?
data = [9, 2, 3, 4, 5, 6, 7, 8]
df = pd.DataFrame(np.random.randn(8, 5), columns=['A', 'B', 'C', 'D', 'E'])
fd = pd.DataFrame(data, columns=['Z'])
df = pd.concat([df, fd], axis=1)

l = []
for x, y in df.iterrows():
    for i, s in y.iteritems():
        if s > 1:
            l.append(x)

print(df['Z'])
The expected output will most likely be a dictionary with the column name as key and the average of Z as its values.
Using a dictionary comprehension:
res = {col: df.loc[df[col] > 1, 'Z'].mean() for col in df.columns[:-1]}
# {'A': 9.0, 'B': 5.0, 'C': 8.0, 'D': 7.5, 'E': 6.666666666666667}
Setup used for above:
np.random.seed(0)
data = [9,2,3,4,5,6,7,8]
df = pd.DataFrame(np.random.randn(8, 5),columns=['A', 'B', 'C', 'D','E'])
fd = pd.DataFrame(data, columns=['Z'])
df = pd.concat([df, fd], axis=1)
Do you mean this?
df[df['Z']>1].loc[:,'Z'].mean(axis=0)
or
df[df['Z']>1]['Z'].mean()
I don't know if I understood your question correctly but do you mean this:
import pandas as pd
import numpy as np
data=[9,2,3,4,5,6,7,8]
columns = ['A', 'B', 'C', 'D','E']
df = pd.DataFrame(np.random.randn(8, 5),columns=columns)
fd=pd.DataFrame(data,columns=['Z'])
df=pd.concat([df,fd], axis=1)
print('df = \n', str(df))
anyGreaterThanOne = (df[columns] > 1).any(axis=1)
print('anyGreaterThanOne = \n', str(anyGreaterThanOne))
filtered = df[anyGreaterThanOne]
print('filtered = \n', str(filtered))
Zmean = filtered['Z'].mean()
print('Zmean = ', str(Zmean))
Result:
df =
A B C D E Z
0 -2.170640 -2.626985 -0.817407 -0.389833 0.862373 9
1 -0.372144 -0.375271 -1.309273 -1.019846 -0.548244 2
2 0.267983 -0.680144 0.304727 0.302952 -0.597647 3
3 0.243549 1.046297 0.647842 1.188530 0.640133 4
4 -0.116007 1.090770 0.510190 -1.310732 0.546881 5
5 -1.135545 -1.738466 -1.148341 0.764914 -1.140543 6
6 -2.078396 0.057462 -0.737875 -0.817707 0.570017 7
7 0.187877 0.363962 0.637949 -0.875372 -1.105744 8
anyGreaterThanOne =
0 False
1 False
2 False
3 True
4 True
5 False
6 False
7 False
dtype: bool
filtered =
A B C D E Z
3 0.243549 1.046297 0.647842 1.188530 0.640133 4
4 -0.116007 1.090770 0.510190 -1.310732 0.546881 5
Zmean = 4.5

pandas / dask calculate percentages for multiple columns - column-parallel operation

When I have a data frame in pandas like:
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'name': ['A', 'B', 'C', 'D', 'E'],
    'nationality': ['DE', 'AUT', 'US', 'US', 'US'],
    'alotdifferent': ['x', 'y', 'z', 'x', 'a'],
    'target': [0, 0, 0, 1, 1],
    'age_group': [1, 2, 1, 3, 1]}
df_a = pd.DataFrame(raw_data, columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'])
df_a.nationality = df_a.nationality.astype('category')
df_a.alotdifferent = df_a.alotdifferent.astype('category')
df_a.name = df_a.name.astype('category')
Currently, I use:
FACTOR_FIELDS = df_a.select_dtypes(include=['category']).columns
columnsToDrop = ['alotdifferent']
columnsToBias_keep = FACTOR_FIELDS[~FACTOR_FIELDS.isin(columnsToDrop)]
target = 'target'
def quotients_slow(df_a):
    # parallelism = 8
    # original = dd.from_pandas(df.copy())
    original = df_a.copy()
    output_df = original
    ratio_weights = {}
    for colname in columnsToBias_keep.union(columnsToDrop):
        # group only a single time
        grouped = original.groupby([colname, target]).size()

        # calculate first ratio
        df = grouped / original[target].sum()
        nameCol = "pre_" + colname
        grouped_res = df.reset_index(name=nameCol)
        grouped_res = grouped_res[grouped_res[target] == 1]
        grouped_res = grouped_res.drop(target, 1)
        # todo persist the result in dict for transformer
        result_1 = grouped_res

        # calculate second ratio
        df = (grouped / grouped.groupby(level=0).sum())
        nameCol_2 = "pre2_" + colname
        grouped = df.reset_index(name=nameCol_2)
        grouped_res = grouped[grouped[target] == 1]
        grouped_res = grouped_res.drop(target, 1)
        result_2 = grouped_res

        # persist the result in dict for transformer
        # this is required to separate fit and transform stage (later on in a sklearn transformer)
        ratio_weights[nameCol] = result_1
        ratio_weights[nameCol_2] = result_2

        # retrieve results
        res_1 = ratio_weights['pre_' + colname]
        res_2 = ratio_weights['pre2_' + colname]

        # merge ratio_weight with original dataframe
        output_df = pd.merge(output_df, res_1, on=colname, how='left')
        output_df = pd.merge(output_df, res_2, on=colname, how='left')
        output_df.loc[(output_df[nameCol].isnull()), nameCol] = 0
        output_df.loc[(output_df[nameCol_2].isnull()), nameCol_2] = 0
        if colname in columnsToDrop:
            output_df = output_df.drop(colname, 1)
    return output_df

quotients_slow(df_a)
to calculate, for each (categorical) column, the ratio of each group to target == 1 in two ways. As I want to perform this operation for multiple columns, I am naively iterating over all of them, but this operation is very slow.
Here in the sample: 10 loops, best of 3: 37 ms per loop. For my real dataset of around 500,000 rows and around 100 columns this really takes a while.
Shouldn't it be possible to speed it up (it is trivially parallel over the columns) in either dask or pandas? Is there a possibility to implement it more efficiently in plain pandas? Is it possible to reduce the number of passes over the data when computing the quotients?
EDIT
When trying to use dask.delayed in the for loop to achieve parallelism over the columns, I can't figure out how to build the graph over the columns, as I need to call compute to get the tuples.
delayed_res_name = delayed(compute_weights)(df_a, 'name')
a,b,c,d = delayed_res_name.compute()
ratio_weights = {}
ratio_weights[c] = a
ratio_weights[d] = b
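A sketch of how the graph could be built without computing inside the loop, assuming compute_weights(df, col) returns the (result_1, result_2, nameCol, nameCol_2) tuple used above:
import dask
from dask import delayed

delayed_results = [delayed(compute_weights)(df_a, col)
                   for col in columnsToBias_keep.union(columnsToDrop)]

# a single compute call materialises the whole graph, running the columns in parallel
results = dask.compute(*delayed_results)

ratio_weights = {}
for res_1, res_2, name_col, name_col_2 in results:
    ratio_weights[name_col] = res_1
    ratio_weights[name_col_2] = res_2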
Here's a reasonably fast solution for your first quotient, using Pandas. It assumes you are not interested in computing proportions for subject_id. I also added some data to your example to cover more edge cases.
First, generate sample data:
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5', '6', '7'],
    'name': ['A', 'B', 'C', 'D', 'E', 'A', 'A'],
    'nationality': ['DE', 'AUT', 'US', 'US', 'US', 'DE', 'DE'],
    'alotdifferent': ['x', 'y', 'z', 'x', 'a', 'x', 'z'],
    'target': [0, 0, 0, 1, 1, 0, 1],
    'age_group': [1, 2, 1, 3, 1, 2, 1]}
df_a = pd.DataFrame(raw_data, columns=['subject_id', 'name', 'nationality', 'alotdifferent', 'target', 'age_group'])
Now compute proportions and measure speed:
def compute_prop(group):
    return group.sum() / float(group.count())

def build_master(df):
    master = df.copy()
    fields = df.drop(['subject_id', 'target'], 1).columns
    for field in fields:
        master = (pd.merge(master, df.groupby(field, as_index=False)
                                     .agg({'target': compute_prop})
                                     .rename(columns={'target': 'pre_{}'.format(field)}),
                           on=field)
                  )
    master = master.sort_values('subject_id')
    return master
%timeit master = build_master(df_a)
10 loops, best of 3: 17.1 ms per loop
Output:
subject_id name nationality alotdifferent target age_group pre_name \
0 1 A DE x 0 1 0.333333
5 2 B AUT y 0 2 0.000000
2 3 C US z 0 1 0.000000
6 4 D US x 1 3 1.000000
3 5 E US a 1 1 1.000000
4 6 A DE x 0 2 0.333333
1 7 A DE z 1 1 0.333333
pre_nationality pre_alotdifferent pre_age_group
0 0.333333 0.333333 0.5
5 0.000000 0.000000 0.0
2 0.666667 0.500000 0.5
6 0.666667 0.333333 1.0
3 0.666667 1.000000 0.5
4 0.333333 0.333333 0.0
1 0.333333 0.500000 0.5
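The second quotient from the question (the share of target == 1 within each level of a column) can be merged in the same way, one groupby per column. A sketch under that assumption, reusing df_a from above (add_second_quotient is a hypothetical helper, not part of the original answer):
def add_second_quotient(df, fields, target='target'):
    out = df.copy()
    for field in fields:
        grouped = df.groupby([field, target]).size()
        # within-level share; transform keeps the MultiIndex aligned for the division
        ratio = grouped / grouped.groupby(level=0).transform('sum')
        ratio = ratio.rename('pre2_' + field).reset_index()
        ratio = ratio[ratio[target] == 1].drop(columns=target)
        out = out.merge(ratio, on=field, how='left')
        out['pre2_' + field] = out['pre2_' + field].fillna(0)
    return out

master2 = add_second_quotient(df_a, ['name', 'nationality', 'alotdifferent', 'age_group'])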

Automatically rename columns to ensure they are unique

I fetch a spreadsheet into a Python DataFrame named df.
Let's give a sample:
df=pd.DataFrame({'a': np.random.rand(10), 'b': np.random.rand(10)})
df.columns=['a','a']
a a
0 0.973858 0.036459
1 0.835112 0.947461
2 0.520322 0.593110
3 0.480624 0.047711
4 0.643448 0.104433
5 0.961639 0.840359
6 0.848124 0.437380
7 0.579651 0.257770
8 0.919173 0.785614
9 0.505613 0.362737
When I run df.columns.is_unique I get False
I would like to automatically rename the second column 'a' to 'a_2' (or something like that).
I don't expect a solution like df.columns = ['a', 'a_2'];
I'm looking for a solution that works for any number of columns!
You can uniquify the columns manually:
df_columns = ['a', 'b', 'a', 'a_2', 'a_2', 'a', 'a_2', 'a_2_2']
def uniquify(df_columns):
    seen = set()
    for item in df_columns:
        fudge = 1
        newitem = item
        while newitem in seen:
            fudge += 1
            newitem = "{}_{}".format(item, fudge)
        yield newitem
        seen.add(newitem)
list(uniquify(df_columns))
#>>> ['a', 'b', 'a_2', 'a_2_2', 'a_2_3', 'a_3', 'a_2_4', 'a_2_2_2']
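A usage sketch, applying it to the df with duplicate columns from the question:
df.columns = list(uniquify(df.columns))
print(df.columns.is_unique)
# True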
I fetch a spreadsheet into a Python DataFrame named df... I would like
to automatically rename [duplicate] column [names].
Pandas does that automatically for you without you having to do anything...
test.xls:
import pandas as pd
import numpy as np
df = pd.read_excel(
    "./test.xls",
    "Sheet1",
    header=0,
    index_col=0,
)
print(df)
--output:--
a b c b.1 a.1 a.2
index
0 10 100 -10 -100 10 21
1 20 200 -20 -200 11 22
2 30 300 -30 -300 12 23
3 40 400 -40 -400 13 24
4 50 500 -50 -500 14 25
5 60 600 -60 -600 15 26
print(df.columns.is_unique)
--output:--
True
If for some reason you are being given a DataFrame with duplicate columns, you can do this:
import pandas as pd
import numpy as np
from collections import defaultdict
df = pd.DataFrame(
    {
        'k': np.random.rand(10),
        'l': np.random.rand(10),
        'm': np.random.rand(10),
        'n': np.random.rand(10),
        'o': np.random.rand(10),
        'p': np.random.rand(10),
    }
)
print(df)
--output:--
k l m n o p
0 0.566150 0.025225 0.744377 0.222350 0.800402 0.449897
1 0.701286 0.182459 0.661226 0.991143 0.793382 0.980042
2 0.383213 0.977222 0.404271 0.050061 0.839817 0.779233
3 0.428601 0.303425 0.144961 0.313716 0.244979 0.487191
4 0.187289 0.537962 0.669240 0.096126 0.242258 0.645199
5 0.508956 0.904390 0.838986 0.315681 0.359415 0.830092
6 0.007256 0.136114 0.775670 0.665000 0.840027 0.991058
7 0.719344 0.072410 0.378754 0.527760 0.205777 0.870234
8 0.255007 0.098893 0.079230 0.225225 0.490689 0.554835
9 0.481340 0.300319 0.649762 0.460897 0.488406 0.16604
df.columns = ['a', 'b', 'c', 'b', 'a', 'a']
print(df)
--output:--
a b c b a a
0 0.566150 0.025225 0.744377 0.222350 0.800402 0.449897
1 0.701286 0.182459 0.661226 0.991143 0.793382 0.980042
2 0.383213 0.977222 0.404271 0.050061 0.839817 0.779233
3 0.428601 0.303425 0.144961 0.313716 0.244979 0.487191
4 0.187289 0.537962 0.669240 0.096126 0.242258 0.645199
5 0.508956 0.904390 0.838986 0.315681 0.359415 0.830092
6 0.007256 0.136114 0.775670 0.665000 0.840027 0.991058
7 0.719344 0.072410 0.378754 0.527760 0.205777 0.870234
8 0.255007 0.098893 0.079230 0.225225 0.490689 0.554835
9 0.481340 0.300319 0.649762 0.460897 0.488406 0.166047
print(df.columns.is_unique)
--output:--
False
name_counts = defaultdict(int)
new_col_names = []
for name in df.columns:
    new_count = name_counts[name] + 1
    new_col_names.append("{}{}".format(name, new_count))
    name_counts[name] = new_count

print(new_col_names)
--output:--
['a1', 'b1', 'c1', 'b2', 'a2', 'a3']
df.columns = new_col_names
print(df)
--output:--
a1 b1 c1 b2 a2 a3
0 0.264598 0.321378 0.466370 0.986725 0.580326 0.671168
1 0.938810 0.179999 0.403530 0.675112 0.279931 0.011046
2 0.935888 0.167405 0.733762 0.806580 0.392198 0.180401
3 0.218825 0.295763 0.174213 0.457533 0.234081 0.555525
4 0.891890 0.196245 0.425918 0.786676 0.791679 0.119826
5 0.721305 0.496182 0.236912 0.562977 0.249758 0.352434
6 0.433437 0.501975 0.088516 0.303067 0.916619 0.717283
7 0.026491 0.412164 0.787552 0.142190 0.665488 0.488059
8 0.729960 0.037055 0.546328 0.683137 0.134247 0.444709
9 0.391209 0.765251 0.507668 0.299963 0.348190 0.731980
print(df.columns.is_unique)
--output:--
True
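The same counter logic can be wrapped in a small helper for reuse (a sketch; like the output above it suffixes every occurrence, including the first):
from collections import defaultdict

def number_duplicate_columns(cols):
    # append an increasing counter to each occurrence of every name
    counts = defaultdict(int)
    new_names = []
    for name in cols:
        counts[name] += 1
        new_names.append("{}{}".format(name, counts[name]))
    return new_names

df.columns = number_duplicate_columns(df.columns)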
In case anyone needs this in Scala:
def renameDup(Header: String): String = {
  val trimmedList: List[String] = Header.split(",").toList
  var fudge = 0
  var newitem = ""
  var seen = List[String]()
  for (item <- trimmedList) {
    fudge = 1
    newitem = item
    for (newitem2 <- seen) {
      if (newitem2 == newitem) {
        fudge += 1
        newitem = item + "_" + fudge
      }
    }
    seen = seen :+ newitem
  }
  return seen.mkString(",")
}
Here's a solution that uses pandas all the way through.
import pandas as pd
# create data frame with duplicate column names
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
df.rename({'a': 'col', 'b': 'col'}, axis=1, inplace=True)
df
---output---
col col
0 1 4
1 2 5
2 3 6
# make a new data frame of column headers and number sequentially
dfcolumns = pd.DataFrame({'name': df.columns})
dfcolumns['counter'] = dfcolumns.groupby('name').cumcount().apply(str)
# remove counter for first case (optional) and combine suffixes
dfcolumns.loc[dfcolumns.counter=='0', 'counter'] = ''
df.columns = dfcolumns['name'] + dfcolumns['counter']
df
---output---
col col1
0 1 4
1 2 5
2 3 6
I ran into this problem when loading DataFrames from Oracle tables. 7stud is right that pd.read_excel() automatically designates duplicated columns with a .1 suffix, but not all of the read functions do this. One workaround is to save the DataFrame to a csv (or excel) file and then reload it to re-designate the duplicated columns.
data = pd.read_sql(SQL, connection)
data.to_csv(r'C:\temp\temp.csv')
data = pd.read_csv(r'C:\temp\temp.csv')
