import pandas as pd

xyarr = [[0, 1, 2], [1, 1, 3], [2, 1, 2]]
df1 = pd.DataFrame(xyarr, columns=['a', 'b', 'c'])
df2 = pd.DataFrame([['text', 'text2']], columns=['x', 'y'])
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
df3 will have NaN values, because columns a, b, c are missing from df2 and columns x, y are missing from df1.
a b c x y
0 0.0 1.0 2.0 NaN NaN
1 1.0 1.0 3.0 NaN NaN
2 2.0 1.0 2.0 NaN NaN
3 NaN NaN NaN text text2
I want to save df3 to a CSV, but without the extra commas that the empty cells produce. Any suggestions?
As pd.concat performs an outer join by default, you get the NaN values from the missing columns a, b, c. If you use another Pandas function, e.g. .join(), which performs a left join by default, you can get around the problem here.
You can try using .join(), as follows:
df3 = df1.join(df2)
Result:
print(df3)
a b c x y
0 0 1 2 text text2
1 1 1 3 NaN NaN
2 2 1 2 NaN NaN
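If the goal is simply a CSV without literal NaN text, note that to_csv writes missing values as empty fields by default; a minimal sketch (the filename here is just an example):
# NaN cells become empty fields in the CSV unless na_rep is given
df3.to_csv('df3.csv', index=False)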
How do I fill a df with empty rows, or create a df with empty rows?
I have this df:
df = pd.DataFrame(columns=["naming","type"])
How can I fill it with empty rows?
Specify index values:
df = pd.DataFrame(columns=["naming","type"], index=range(10))
print (df)
naming type
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
5 NaN NaN
6 NaN NaN
7 NaN NaN
8 NaN NaN
9 NaN NaN
If you need empty strings instead:
df = pd.DataFrame('', columns=["naming","type"], index=range(10))
print (df)
naming type
0
1
2
3
4
5
6
7
8
9
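If you already have a populated DataFrame and want to pad it out to a fixed number of rows, a small sketch using reindex may help (new rows are filled with NaN, or with whatever fill_value you pass; the row count 10 is just an example):
# extend an existing frame to 10 rows; added rows get fill_value
df = df.reindex(range(10), fill_value='')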
Suppose I have a dataframe:
a b c
0 1 2 NaN
1 2 NaN 4
3 NaN 4 NaN
I want to check for NaN in only some particular column's and want the resulting dataframe as:
a b c
0 1 2 NaN
3 NaN 4 NaN
Here I want to check for NaN in only Column 'a' and Column 'c'.
How this can be done?
You could do that with the isnull and any methods:
In [264]: df
Out[264]:
a b c
0 1 2 NaN
1 2 NaN 4
2 NaN 4 NaN
In [265]: df[df.isnull().any(axis=1)]
Out[265]:
a b c
0 1 2 NaN
2 NaN 4 NaN
Note: if you just want the clean rows without any NaN, you could use the dropna method.
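For example, a quick sketch of both variants (dropna(subset=...) restricts the check to the given columns):
df.dropna()                   # keep rows with no NaN in any column
df.dropna(subset=['a', 'c'])  # keep rows with no NaN in 'a' or 'c'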
EDIT
If you want to restrict the check to particular columns, build the null mask from a column subset and use it to index the whole dataframe:
df_subset = df[['a', 'c']]
In [282]: df[df_subset.isnull().any(axis=1)]
Out[282]:
a b c
0 1 2 NaN
2 NaN 4 NaN
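The same thing as a one-liner, without the intermediate variable:
df[df[['a', 'c']].isnull().any(axis=1)]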
I have a pandas dataframe with two id variables:
import pandas as pd

df = pd.DataFrame({'id': [1, 1, 1, 2, 2, 3],
                   'num': [10, 10, 12, 13, 14, 15],
                   'q': ['a', 'b', 'd', 'a', 'b', 'z'],
                   'v': [2, 4, 6, 8, 10, 12]})
id num q v
0 1 10 a 2
1 1 10 b 4
2 1 12 d 6
3 2 13 a 8
4 2 14 b 10
5 3 15 z 12
I can pivot the table with:
df.pivot(index='id', columns='q', values='v')
And end up with something close:
q a b d z
id
1 2 4 6 NaN
2 8 10 NaN NaN
3 NaN NaN NaN 12
However, what I really want is (the original unmelted form):
id num a b d z
1 10 2 4 NaN NaN
1 12 NaN NaN 6 NaN
2 13 8 NaN NaN NaN
2 14 NaN 10 NaN NaN
3 15 NaN NaN NaN 12
In other words:
'id' and 'num' are my indices (normally I've only seen either 'id' or 'num' being the index, but I need both since I'm trying to retrieve the original unmelted form),
'q' supplies my columns, and
'v' supplies my values in the table.
Update
I found a close solution from Wes McKinney's blog:
df.pivot_table(index=['id','num'], columns='q')
v
q a b d z
id num
1 10 2 4 NaN NaN
12 NaN NaN 6 NaN
2 13 8 NaN NaN NaN
14 NaN 10 NaN NaN
3 15 NaN NaN NaN 12
However, the format is not quite the same as what I want above.
You could use set_index and unstack
In [18]: df.set_index(['id', 'num', 'q'])['v'].unstack().reset_index()
Out[18]:
q id num a b d z
0 1 10 2.0 4.0 NaN NaN
1 1 12 NaN NaN 6.0 NaN
2 2 13 8.0 NaN NaN NaN
3 2 14 NaN 10.0 NaN NaN
4 3 15 NaN NaN NaN 12.0
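If the leftover q name on the column index bothers you, a small variant using rename_axis (available in recent pandas versions) clears it in the same chain:
(df.set_index(['id', 'num', 'q'])['v']
   .unstack()
   .rename_axis(columns=None)  # drop the 'q' columns name
   .reset_index())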
You're really close, slaw. Just rename your column index to None and you've got what you want.
df2 = df.pivot_table(index=['id','num'], columns='q')
df2.columns = df2.columns.droplevel().rename(None)
df2.reset_index().fillna("null").to_csv("test.csv", sep="\t", index=None)
Note that the 'v' column is expected to be numeric by default so that it can be aggregated. Otherwise, Pandas will error out with:
DataError: No numeric types to aggregate
To resolve this, you can specify your own aggregation function, e.g. an identity lambda:
df2 = df.pivot_table(index=['id','num'], columns='q', aggfunc=lambda x: x)
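Alternatively, a built-in aggregator that tolerates non-numeric data works too, e.g. 'first' (equivalent here, since each (id, num, q) combination occurs only once):
df2 = df.pivot_table(index=['id','num'], columns='q', aggfunc='first')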
You can remove the columns name q with:
df1.columns = df1.columns.tolist()
Zero's answer plus removing q:
df1 = df.set_index(['id', 'num', 'q'])['v'].unstack().reset_index()
df1.columns = df1.columns.tolist()
id num a b d z
0 1 10 2.0 4.0 NaN NaN
1 1 12 NaN NaN 6.0 NaN
2 2 13 8.0 NaN NaN NaN
3 2 14 NaN 10.0 NaN NaN
4 3 15 NaN NaN NaN 12.0
This might work just fine:
Pivot:
df2 = df.pivot_table(index=['id', 'num'], columns='q', values='v').reset_index()
If you omit values='v', the pivot produces a column MultiIndex; flatten it by concatenating the first-level column names with the second:
df2.columns = [s1 + str(s2) for (s1, s2) in df2.columns.tolist()]
Came up with a close solution
df2 = df.pivot_table(index=['id','num'], columns='q')
df2.columns = df2.columns.droplevel()
df2.reset_index().fillna("null").to_csv("test.csv", sep="\t", index=None)
Still can't figure out how to drop 'q' from the dataframe
It can be done in three steps:
#1: Prepare an auxiliary column 'id_num':
df['id_num'] = df[['id', 'num']].apply(tuple, axis=1)
df = df.drop(columns=['id', 'num'])
#2: 'pivot' is almost an inverse of melt:
df, df.columns.name = df.pivot(index='id_num', columns='q', values='v').reset_index(), ''
#3: Bring back 'id' and 'num' columns:
df['id'], df['num'] = zip(*df['id_num'])
df = df.drop(columns=['id_num'])
This is the result, but with a different column order:
a b d z id num
0 2.0 4.0 NaN NaN 1 10
1 NaN NaN 6.0 NaN 1 12
2 8.0 NaN NaN NaN 2 13
3 NaN 10.0 NaN NaN 2 14
4 NaN NaN NaN 12.0 3 15
Alternatively, with the proper column order:
def multiindex_pivot(df, columns=None, values=None):
    # inspired by: https://github.com/pandas-dev/pandas/issues/23955
    names = list(df.index.names)
    df = df.reset_index()
    list_index = df[names].values
    tuples_index = [tuple(i) for i in list_index]  # hashable
    df = df.assign(tuples_index=tuples_index)
    df = df.pivot(index="tuples_index", columns=columns, values=values)
    tuples_index = df.index  # reduced
    index = pd.MultiIndex.from_tuples(tuples_index, names=names)
    df.index = index
    df = df.reset_index()  # me
    df.columns.name = ''  # me
    return df
df = df.set_index(['id', 'num'])
df = multiindex_pivot(df, columns='q', values='v')
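Assuming the sample frame from the question, this should print something like:
   id  num    a     b    d     z
0   1   10  2.0   4.0  NaN   NaN
1   1   12  NaN   NaN  6.0   NaN
2   2   13  8.0   NaN  NaN   NaN
3   2   14  NaN  10.0  NaN   NaN
4   3   15  NaN   NaN  NaN  12.0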
I am concatenating two data files using Pandas. The concat works well, but when I write the data back to CSV the columns lose their alignment:
import os

import numpy as np
import pandas as pd

# Define DataFrame 1
headerList1 = ['A', 'B', 'C', 'D']
b1 = np.array([[0, 'B_foo', 2, 'D_one'],
               [3, 'B_bar', 5, 'D_two'],
               [6, 'B_cat', 8, 'D_one']])
df1 = pd.DataFrame(b1, columns=headerList1)

# Define DataFrame 2
headerList2 = ['C', 'E', 'F', 'G']
b2 = np.array([[12, 'E_foo', 2, 'G_one'],
               [15, 'E_bar', 5, 'G_two'],
               [19, 'E_cat', 8, 'G_one']])
df2 = pd.DataFrame(b2, columns=headerList2)

# Concat DataFrames
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)

# Write to csv
scratchFile = os.path.join(dir, 'scratch.csv')
df3.to_csv(scratchFile, index_label=False)
I am looking for:
A B C D E F G
0 B_foo 2 D_one NaN NaN NaN
3 B_bar 5 D_two NaN NaN NaN
6 B_cat 8 D_one NaN NaN NaN
NaN NaN 12 NaN E_foo 2 G_one
NaN NaN 15 NaN E_bar 5 G_two
NaN NaN 19 NaN E_cat 8 G_one
but get:
A B C D E F G
0 0 B_foo 2 D_one NaN NaN
1 3 B_bar 5 D_two NaN NaN
2 6 B_cat 8 D_one NaN NaN
3 NaN NaN 12 NaN E_foo 2 G_one
4 NaN NaN 15 NaN E_bar 5 G_two
5 NaN NaN 19 NaN E_cat 8 G_one
I can almost reach the desired result by removing index_label=False from the to_csv() command but this results in the addition of an undesired index column.
Is there a way to get the desired output without the index column? Also, of personal interest, why does removing the index_label=False disrupt the column organization?
Thanks!
df3.to_csv('df3.csv', index=False)
This worked for me. index=False means that the dataframe index is not included in the csv.
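If you also want the literal NaN text from the desired output rather than empty fields, to_csv's na_rep parameter controls how missing values are written:
df3.to_csv(scratchFile, index=False, na_rep='NaN')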