Merge 3 or more dataframes - python

I am trying to merge 3 dataframes by index, but so far without success.
Here is the code:
import pandas as pd
from functools import reduce
#identifying csvs
x='/home/'
csvpaths = ("Data1.csv", "Data2.csv", "Data3.csv")
dfs = list() # an empty list
#creating dataframes based on number of csvs
for i in range (len(csvpaths)):
dfs.append(pd.read_csv(str(x)+ csvpaths[i],index_col=0))
print(dfs[1])
#creating suffix for each dataframe's columns
S=[]
for y in csvpaths:
s=str(y).split('.csv')[0]
S.append(s)
print(S)
#merging attempt
dfx = lambda a,b: pd.merge(a,b,on='SHIP_ID',suffixes=(S)), dfs
print(dfx)
print(dfx.columns)
If I try to export it as CSV, I get the following error (a similar error occurs when I try to print dfx.columns):
'tuple' object has no attribute 'to_csv'
The output I want is a merge of the 3 dataframes as follows (with their respective suffixes); please help.
[Note: the table below is very simplified; the original table consists of dozens of columns and thousands of rows, hence I require a practical merging method.]

Try:
# Append each frame's suffix to its column names, then join the frames
# side by side, aligned on their row index.
for s, el in zip(suffixes, dfs):
    el.columns = [str(col) + s for col in el.columns]
# ignore_index stays False: with axis=1, ignore_index=True would relabel the
# columns 0..n-1 and throw away the suffixed names assigned above.
dfx = pd.concat(dfs, ignore_index=False, sort=False, axis=1)
For the test case I used:
import pandas as pd

# Three small frames of different lengths that all share a column named "x".
dfs = [
    pd.DataFrame({"x": [1, 2, 7], "y": list("ghi")}),
    pd.DataFrame({"x": [5, 6], "z": [4, 4]}),
    pd.DataFrame({"x": list("acgjksd")}),
]
suffixes = ["_1", "_2", "_3"]

# Make every column name unique by tagging it with its frame's suffix.
for s, el in zip(suffixes, dfs):
    el.columns = [str(col) + s for col in el.columns]

# Join side by side on the row index. ignore_index stays False: with axis=1,
# ignore_index=True would relabel the columns 0..n-1 and lose the suffixes.
dfx = pd.concat(dfs, ignore_index=False, sort=False, axis=1)
print(dfx)
x_1 y_1 x_2 z_2 x_3
0 1.0 g 5.0 4.0 a
1 2.0 h 6.0 4.0 c
2 7.0 i NaN NaN g
3 NaN NaN NaN NaN j
4 NaN NaN NaN NaN k
5 NaN NaN NaN NaN s
6 NaN NaN NaN NaN d
Edit:
# Variant that aligns the frames on a shared key column instead of the
# default row index. 'ID' stands for that key column; it is assumed to be
# present in every frame -- confirm against the actual data.
for s, el in zip(suffixes, dfs):
    # Move the key into the index *before* renaming; otherwise the key column
    # would already be called 'ID' + suffix and set_index('ID') would raise
    # KeyError.
    el.set_index('ID', inplace=True)
    el.columns = [str(col) + s for col in el.columns]
# Join side by side on the key, then turn the key back into a regular column.
dfx = pd.concat(dfs, ignore_index=False, sort=False, axis=1).reset_index()

Related

Pivot table reindexing in pandas

Having a dataframe as below:
df1 = pd.DataFrame({'Name1':['A','Q','A','B','B','C','C','C','E','E','E'],
'Name2':['B','C','D','C','D','D','A','B','A','B','C'],'Marks2':[10,20,6,50, 88,23,140,9,60,65,70]})
df1
#created a new frame
new=df1.loc[(df1['Marks2'] <= 50)]
new
#created a pivot table
temp=new.pivot_table(index="Name1", columns="Name2", values="Marks2")
temp
I tried to re-index the pivot table.
new_value=['E']
order = new_value+list(temp.index.difference(new_value))
matrix=temp.reindex(index=order, columns=order)
matrix
But the values related to 'E' are not present in the pivot table, even though the dataframe df1 contains values related to 'E'. I need to add the values related to 'E' to the pivot table.
Expected output:
Based on the comments my understanding of the intended result:
E A B C D
E NaN 60.0 65.0 70.0 NaN
A NaN NaN 10.0 NaN 6.0
C NaN NaN 9.0 NaN 23.0
Q NaN NaN NaN 20.0 NaN
Code:
Activate the included #print() statements to see what each step does.
In particular, you may adapt the header 'formatting' at the end according to your needs.
import pandas as pd
import numpy as np

# Sample data: one mark per (Name1, Name2) pair.
df1 = pd.DataFrame(
    {
        'Name1': ['A', 'Q', 'A', 'B', 'B', 'C', 'C', 'C', 'E', 'E', 'E'],
        'Name2': ['B', 'C', 'D', 'C', 'D', 'D', 'A', 'B', 'A', 'B', 'C'],
        'Marks2': [10, 20, 6, 50, 88, 23, 140, 9, 60, 65, 70],
    }
)

# Blank out marks of 50 or more, except on rows whose Name1 is 'E'.
blank_mask = (df1['Marks2'] >= 50) & (df1['Name1'] != 'E')
df1['Marks2'] = np.where(blank_mask, np.nan, df1['Marks2'])
#print(df1)

temp = df1.pivot_table(index="Name1", columns="Name2", values="Marks2")
#print(temp)

name1_to_move = 'E'

# Rebuild the row labels with name1_to_move first and the remaining labels
# in their existing order, then reindex so that row moves to the top.
idx = temp.index.tolist()
idx.remove(name1_to_move)
idx = [name1_to_move] + idx
temp = temp.reindex(idx)
#print(temp)

# Prepend an all-NaN column labelled name1_to_move.
temp.insert(loc=0, column=name1_to_move, value=np.nan)
#print(temp)

# Drop the axis names so they do not show up in the printed header.
temp.index.name = None
#print(temp)
temp = temp.rename_axis(None, axis=1)
print(temp)

Merge two data frames

I tried to merge two data frames by adding the first line of the second df to the first line of the first df. I also tried to concatenate them, but either attempt failed.
The format of the Data is
1,3,N0128,Durchm.,5.0,0.1,5.0760000000000005,0.076,-----****--
2,0.000,,,,,,,
3,3,N0129,Position,62.2,0.376,62.238,0.136,***---
4,76.1,-36.000,0.300,-36.057,,,,
5,2,N0130,Durchm.,5.0,0.1,5.067,0.067,-----***---
6,0.000,,,,,,,
The expected format of the output should be
1,3,N0128,Durchm.,5.0,0.1,5.0760000000000005,0.076,-----****--,0.000,,,,,,,
2,3,N0129,Position,62.2,0.376,62.238,0.136,***---**,76.1,-36.000,0.300,-36.057,,,,
3,N0130,Durchm.,5.0,0.1,5.067,0.067,-----***---,0.000,,,,,,,
I have already split the dataframe from above into two frames. The first one contains only the odd indexes and the second one the even ones.
My problem now is to merge/concatenate the two frames by adding the first row of the second df to the first row of the first df. I have already tried some methods of merging/concatenating, but all of them failed. The print calls are not necessary; I only use them to get a quick overview in the console.
The code which I felt most comfortable with is:
os.chdir(output)
csv_files = os.listdir('.')
for csv_file in (csv_files):
if csv_file.endswith(".asc.csv"):
df = pd.read_csv(csv_file)
keep_col = ['Messpunkt', 'Zeichnungspunkt', 'Eigenschaft', 'Position', 'Sollmass','Toleranz','Abweichung','Lage']
new_df = df[keep_col]
new_df = new_df[~new_df['Messpunkt'].isin(['**Teil'])]
new_df = new_df[~new_df['Messpunkt'].isin(['**KS-Oben'])]
new_df = new_df[~new_df['Messpunkt'].isin(['**KS-Unten'])]
new_df = new_df[~new_df['Messpunkt'].isin(['**N'])]
print(new_df)
new_df.to_csv(output+csv_file)
df1 = new_df[new_df.index % 2 ==1]
df2 = new_df[new_df.index % 2 ==0]
df1.reset_index()
df2.reset_index()
print (df1)
print (df2)
merge_df = pd.concat([df1,df2], axis=1)
print (merge_df)
merge_df.to_csv(output+csv_file)
I highly appreciate some help.
With this code, the output is:
1,3,N0128,Durchm.,5.0,0.1,5.0760000000000005,0.076,-----****--,,,,,,,,
2,,,,,,,,,0.000,,,,,,,
3,3,N0129,Position,62.2,0.376,62.238,0.136,***---,,,,,,,,
4,,,,,,,,,76.1,-36.000,0.300,-36.057,,,,
5,2,N0130,Durchm.,5.0,0.1,5.067,0.067,-----***---,,,,,,,,
6,,,,,,,,,0.000,,,,,,,
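I get the expected result when I use reset_index() so that both DataFrames have the same index. Note that reset_index() returns a new DataFrame, so calling it without using the returned value has no effect.
It may also need drop=True to avoid adding the old index as a new column.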
pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)
Minimal working example.
I use io only to simulate file in memory.
import io

import pandas as pd

# Sample input: rows come in pairs that belong together on one output row.
text = '''1,3,N0128,Durchm.,5.0,0.1,5.0760000000000005,0.076,-----****--
2,0.000,,,,,,,
3,3,N0129,Position,62.2,0.376,62.238,0.136,***---
4,76.1,-36.000,0.300,-36.057,,,,
5,2,N0130,Durchm.,5.0,0.1,5.067,0.067,-----***---
6,0.000,,,,,,,'''

pd.options.display.max_columns = 20 # to display all columns

# The first field of each row becomes the index; there is no header row.
df = pd.read_csv(io.StringIO(text), header=None, index_col=0)
#print(df)

# Split into odd-indexed and even-indexed rows.
is_odd = df.index % 2 == 1
df1 = df[is_odd]
df2 = df[~is_odd]
#print(df1)
#print(df2)

# Give both halves the same 0..n-1 index so that concat pairs them row by
# row instead of aligning on the original (disjoint) index values.
halves = [part.reset_index(drop=True) for part in (df1, df2)]
merge_df = pd.concat(halves, axis=1)
print(merge_df)
Result:
1 2 3 4 5 6 7 8 1 2 3 4 5 6 7 8
0 3.0 N0128 Durchm. 5.0 0.100 5.076 0.076 -----****-- 0.0 NaN NaN NaN NaN NaN NaN NaN
1 3.0 N0129 Position 62.2 0.376 62.238 0.136 ***--- 76.1 -36.000 0.300 -36.057 NaN NaN NaN NaN
2 2.0 N0130 Durchm. 5.0 0.100 5.067 0.067 -----***--- 0.0 NaN NaN NaN NaN NaN NaN NaN
EDIT:
It may need
merge_df.index = merge_df.index + 1
to correct index.

insert missing rows in df with dictionary values

Hello I have the following dataframe
df = pd.DataFrame(data={'grade_1':['A','B','C'],
'grade_1_count': [19,28,32],
'grade_2': ['pass','fail',np.nan],
'grade_2_count': [39,18, np.nan]})
whereby some grades are missing and need to be inserted into the grade_n column according to the values in this dictionary
grade_dict = {'grade_1':['A','B','C','D','E','F'],
'grade_2' : ['pass','fail','not present', 'borderline']}
and the corresponding row value in the _count column should be filled with np.nan
so the expected output is like this
expected_df = pd.DataFrame(data={'grade_1':['A','B','C','D','E','F'],
'grade_1_count': [19,28,32,0,0,0],
'grade_2': ['pass','fail','not preset','borderline', np.nan, np.nan],
'grade_2_count': [39,18,0,0,np.nan,np.nan]})
So far I have this rather inelegant code that creates a column that includes all the correct categories for the grades, but I cannot reinsert it into the dataframe or fill the count columns with zeros (the np.nan values just reflect empty cells due to coercing columns with different numbers of rows). I hope that makes sense. Any advice would be great. Thanks.
x=[]
for k, v in grade_dict.items():
out = df[k].reindex(grade_dict[k], axis=0, fill_value=0)
x = pd.concat([out], axis=1)
x[k] = x.index
x = x.reset_index(drop=True)
df[k] = x.fillna(np.nan)
Here is a solution using two consecutive merges:
# set up combinations
from itertools import zip_longest
df2 = pd.DataFrame(list(zip_longest(*grade_dict.values())), columns=grade_dict)
# merge
(df2.merge(df.filter(like='grade_1'),
on='grade_1', how='left')
.merge(df.filter(like='grade_2'),
on='grade_2', how='left')
.sort_index(axis=1)
)
output:
grade_1 grade_1_count grade_2 grade_2_count
0 A 19.0 pass 39.0
1 B 28.0 fail 18.0
2 C 32.0 not present NaN
3 D NaN borderline NaN
4 E NaN None NaN
5 F NaN None NaN
multiple merges:
# One row per position across the grade lists; shorter lists are padded with
# None by zip_longest. The dict keys become the column names.
df2 = pd.DataFrame(list(zip_longest(*grade_dict.values())), columns=grade_dict)
# Left-merge the columns of df whose names contain each grade column's name
# (e.g. 'grade_1' and 'grade_1_count'), keyed on that grade column.
for col in grade_dict:
    df2 = df2.merge(df.filter(like=col),
                    on=col, how='left')
df2
If you only need to merge on grade_1 without updating the non-NaNs of grade_2, you can cast grade_dict into a df and then use combine_first:
print (df.set_index("grade_1").combine_first(pd.DataFrame(grade_dict.values(),
index=grade_dict.keys()).T.set_index("grade_1"))
.fillna({"grade_1_count": 0}).reset_index())
grade_1 grade_1_count grade_2 grade_2_count
0 A 19.0 pass 39.0
1 B 28.0 fail 18.0
2 C 32.0 not present NaN
3 D 0.0 borderline NaN
4 E 0.0 None NaN
5 F 0.0 None NaN

search for value with in column by multiindex and take value of another column

I have a multiindex dataframe df and a second dataframe df1. I would like to look up "SPX" in df1, take its "correl" value, and add that value to the "correl" column of df:
import pandas as pd
import numpy as np
np.arrays = [['one','one','one','two','two','two'],
["DJ30","SPX","Example","Example","Example","Example"]]
df = pd.DataFrame(columns=[])
df = pd.DataFrame(np.random.randn(6,2),
index=pd.MultiIndex.from_tuples(list(zip(*np.arrays))),
columns=['correl','beta'])
df['correl'] = ''
df['beta'] = ''
df
df1 = pd.DataFrame([[0.95, 0.7, "SPX"]],
columns=['correl', 'beta', 'index'])
df1
I expect:
correl whatever
one DJ30
SPX 0.95
Example
two
Example
Example
Example
You can reset_index, merge and set_index:
df.reset_index().merge(df1,
left_on='level_1',
right_on='index',
suffixes=('_x',''),
how='left')\
.set_index(['level_0','level_1'])
Output:
correl beta index
level_0 level_1
one DJ30 NaN NaN NaN
SPX 0.95 0.7 SPX
Example NaN NaN NaN
two Example NaN NaN NaN
Example NaN NaN NaN
Example NaN NaN NaN

pandas.DataFrame: How to align / group and sort data by index?

I'm new to pandas and still don't have a good overview about its power and how to use it. So the problem is hopefully simple :)
I have a DataFrame with a date-index and several columns (stocks and their Open and Close-prices). Here is some example data for two stocks A and B:
import pandas as pd
_ = pd.to_datetime
A_dt = [_('2018-01-04'), _('2018-01-01'), _('2018-01-05')]
B_dt = [_('2018-01-01'), _('2018-01-05'), _('2018-01-03'), _('2018-01-02')]
A_data = [(12, 11), (10, 9), (8, 9)]
B_data = [(2, 2), (3, 4), (4, 4), (5, 3)]
As you see the data is incomplete, different missing dates for each series. I want to put these data together in a single dataframe with sorted row-index dt and 4 columns (2 stocks x 2 time series each).
When I do it this way, everything works fine (except that I'd like to change the column-levels and don't know how to do it):
# MultiIndex on axis 0, then unstacking
# Each frame gets a (symbol, dt) row index and plain Open/Close columns.
i0_a = pd.MultiIndex.from_tuples([("A", x) for x in A_dt], names=['symbol', 'dt'])
i0_b = pd.MultiIndex.from_tuples([("B", x) for x in B_dt], names=['symbol', 'dt'])
df0_a = pd.DataFrame(A_data, index=i0_a, columns=["Open", "Close"])
df0_b = pd.DataFrame(B_data, index=i0_b, columns=["Open", "Close"])
df = pd.concat([df0_a, df0_b])
df = df.unstack('symbol') # this automatically sorts by dt.
# print() call form: valid on Python 3 and, with a single argument, on Python 2.
print(df)
# Open Close
#symbol A B A B
#dt
#2018-01-01 10.0 2.0 9.0 2.0
#2018-01-02 NaN 5.0 NaN 3.0
#2018-01-03 NaN 4.0 NaN 4.0
#2018-01-04 12.0 NaN 11.0 NaN
#2018-01-05 8.0 3.0 9.0 4.0
However when I put the MultiIndex on the columns, things are different
# MultiIndex on axis 1
# Each frame keeps a plain date index and gets (symbol, series) columns.
i1_a = pd.MultiIndex.from_tuples([("A", "Open"), ("A", "Close")], names=['symbol', 'series'])
i1_b = pd.MultiIndex.from_tuples([("B", "Open"), ("B", "Close")], names=['symbol', 'series'])
df1_a = pd.DataFrame(A_data, index=A_dt, columns=i1_a)
df1_b = pd.DataFrame(B_data, index=B_dt, columns=i1_b)
# concat along the default axis=0 appends the rows of df1_b below df1_a.
df = pd.concat([df1_a, df1_b])
# print() call form: valid on Python 3 and, with a single argument, on Python 2.
print(df)
#symbol A B
#series Close Open Close Open
#2018-01-04 11.0 12.0 NaN NaN
#2018-01-01 9.0 10.0 NaN NaN
#2018-01-05 9.0 8.0 NaN NaN
#2018-01-01 NaN NaN 2.0 2.0
#2018-01-05 NaN NaN 4.0 3.0
#2018-01-03 NaN NaN 4.0 4.0
#2018-01-02 NaN NaN 3.0 5.0
Why isn't the data aligned automatically in this case, but in the other?
How can I align and sort it in the second example?
Which method would probably be faster on a large dataset (about 5000 stocks, 1000 timesteps and not only 2 series per stock (Open, Close), but about 20)? This will finally be used as input for a keras machine learning model.
Edit: With jezraels answer I timed 3 different methods of concat / combining DataFrames. My first approach is the fastest. Using combine_first turns out to be an order of magnitude slower than the other methods. The size of the data is still kept very small in the example:
import timeit

# Shared setup executed by timeit before each timed statement: builds random
# data per stock plus the index/column objects each method needs.
setup = """
import pandas as pd
import numpy as np
stocks = 20
steps = 20
features = 10
data = []
index_method1 = []
index_method2 = []
cols_method1 = []
cols_method2 = []
df = None
for s in range(stocks):
    name = "stock{0}".format(s)
    index = np.arange(steps)
    data.append(np.random.rand(steps, features))
    index_method1.append(pd.MultiIndex.from_tuples([(name, x) for x in index], names=['symbol', 'dt']))
    index_method2.append(index)
    cols_method1.append([chr(65 + x) for x in range(features)])
    cols_method2.append(pd.MultiIndex.from_arrays([[name] * features, [chr(65 + x) for x in range(features)]], names=['symbol', 'series']))
"""

# Method 1: (symbol, dt) MultiIndex on the rows, concat row-wise, then
# unstack the symbol level into the columns once at the end.
method1 = """
for s in range(stocks):
    df_new = pd.DataFrame(data[s], index=index_method1[s], columns=cols_method1[s])
    if s == 0:
        df = df_new
    else:
        df = pd.concat([df, df_new])
df = df.unstack('symbol')
"""

# Method 2: (symbol, series) MultiIndex on the columns, combined one frame
# at a time with combine_first.
method2 = """
for s in range(stocks):
    df_new = pd.DataFrame(data[s], index=index_method2[s], columns=cols_method2[s])
    if s == 0:
        df = df_new
    else:
        df = df.combine_first(df_new)
"""

# Method 3: stack each frame, concat column-wise, then unstack and reorder
# the column levels once at the end.
method3 = """
for s in range(stocks):
    df_new = pd.DataFrame(data[s], index=index_method2[s], columns=cols_method2[s])
    if s == 0:
        df = df_new.stack()
    else:
        df = pd.concat([df, df_new.stack()], axis=1)
df = df.unstack().swaplevel(0,1, axis=1).sort_index(axis=1)
"""

print ("Multi-Index axis 0, then concat: {} s".format((timeit.timeit(method1, setup, number=1))))
print ("Multi-Index axis 1, combine_first: {} s".format((timeit.timeit(method2, setup, number=1))))
print ("Stack and then concat: {} s".format((timeit.timeit(method3, setup, number=1))))
Multi-Index axis 0, then concat: 0.134283173989 s
Multi-Index axis 1, combine_first: 5.02396191049 s
Stack and then concat: 0.272278263371 s
The problem is that the two DataFrames have different MultiIndex columns, so there is nothing to align on.
One solution is to use stack on each frame, concat the results into a 2-column DataFrame, then unstack; to get the correct MultiIndex order, add swaplevel and sort_index:
df = (pd.concat([df1_a.stack(), df1_b.stack()], axis=1)
.unstack()
.swaplevel(0,1, axis=1)
.sort_index(axis=1))
print (df)
series Close Open
symbol A B A B
2018-01-01 9.0 2.0 10.0 2.0
2018-01-02 NaN 3.0 NaN 5.0
2018-01-03 NaN 4.0 NaN 4.0
2018-01-04 11.0 NaN 12.0 NaN
2018-01-05 9.0 4.0 8.0 3.0
But it is better to use combine_first:
df = df1_a.combine_first(df1_b)
print (df)
symbol A B
series Close Open Close Open
2018-01-01 9.0 10.0 2.0 2.0
2018-01-02 NaN NaN 3.0 5.0
2018-01-03 NaN NaN 4.0 4.0
2018-01-04 11.0 12.0 NaN NaN
2018-01-05 9.0 8.0 4.0 3.0

Categories

Resources