How to concat CSV files having different dimensions - Python

I have 10 CSV files with different dimensions (screenshots omitted) that I want to concatenate into one file, but whenever I do, the format changes. I want a single CSV file. This is what I tried:
import glob
import pandas as pd

dfs = glob.glob(path + '*.csv')
result = pd.concat([pd.read_csv(df, header=None) for df in dfs])
result.to_csv(path + 'merge.csv', header=None)

You may want to combine the CSV files horizontally. Use axis=1, for example:
df1 = pd.read_csv('f1.txt')
df2 = pd.read_csv('f2.txt')
combined = pd.concat([df1, df2], axis=1)
combined.to_csv('merged_csv.csv')
This worked for me:
import pandas as pd
import os
os.chdir(path)
dfs = [pd.read_csv(f, parse_dates=[0])
       for f in os.listdir(os.getcwd()) if f.endswith('csv')]
result_csv = pd.concat(dfs, axis=1)
result_csv.to_csv('result.csv')

You have to use .concat.
The code below only works for a few CSV files (3 here):
df1 = pd.read_csv(r"address\1.csv",
                  index_col=[0], parse_dates=[0])
df2 = pd.read_csv(r"address\2.csv",
                  index_col=[0], parse_dates=[0])
df3 = pd.read_csv(r"address\3.csv",
                  index_col=[0], parse_dates=[0])
finaldf = pd.concat([df1, df2, df3], axis=1, join='inner').sort_index()
finaldf.to_csv('result.csv')
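For context, join='inner' keeps only the index labels that all frames share, while the default join='outer' keeps the union and fills the gaps with NaN. A minimal sketch with made-up frames showing the difference:
import pandas as pd

a = pd.DataFrame({'x': [1, 2]}, index=[0, 1])
b = pd.DataFrame({'y': [3, 4]}, index=[1, 2])

# outer (default): union of index labels, missing cells become NaN
print(pd.concat([a, b], axis=1))
# inner: intersection of index labels, only label 1 survives
print(pd.concat([a, b], axis=1, join='inner'))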
With the code below you can concatenate as many CSV files as you want from the same directory:
import pandas as pd
import os
os.chdir(path)
dfs = [pd.read_csv(f, index_col=[0], parse_dates=[0])
       for f in os.listdir(os.getcwd()) if f.endswith('csv')]
result_csv = pd.concat(dfs, axis=1, join='inner').sort_index()
result_csv.to_csv('result.csv')
It will be saved as result.csv in the same path that you used above.
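If you would rather not change the working directory, a sketch of the same idea using glob (assuming path is defined and ends with a separator, as in the question):
import glob
import pandas as pd

files = glob.glob(path + '*.csv')
dfs = [pd.read_csv(f, index_col=[0], parse_dates=[0]) for f in files]
pd.concat(dfs, axis=1, join='inner').sort_index().to_csv(path + 'result.csv')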

Related

Merge excel files from a folder with more than one tab

I have to build a solution that unifies all Excel files in a folder and generates a new consolidated Excel file with all the information. The files have the same number of tabs (3) with the same names.
I tried this way:
import pandas as pd
import glob

path = r"C:\Users\Alan\Desktop"
filenames = glob.glob(path + r"\*.xlsx")
outputxlsx = pd.DataFrame()
for file in filenames:
    df = pd.concat(pd.read_excel(file, sheet_name=None), ignore_index=True, sort=False)
    outputxlsx = outputxlsx.append(df, ignore_index=True)
outputxlsx.to_excel(r"C:\Users\Alan\Desktop\Output.xlsx", index=False)
Unfortunately on the first tab the header is replicated and the other two tabs are not generated.
from pathlib import Path
import pandas as pd
def get_data_by_sheet(file_path: str) -> dict:
    return {x: df for x, df in pd.read_excel(file_path, sheet_name=None).items() if not df.empty}
path = "C:/Users/Alan/Desktop/"
all_files = [x for x in Path(path).rglob("*.xlsx")]
(pd
.concat([pd.concat([df for sheet, df in list(get_data_by_sheet(file_path=file).items())]) for file in all_files])
.reset_index(drop=True)
).to_excel(f"{path}final_df.xlsx", index=False)
Or if you also want to know what workbook and sheet each row came from:
(pd
 .concat(
     [pd.concat([df.assign(file_name=Path(file).stem).assign(sheet_name=sheet)
                 for sheet, df in get_data_by_sheet(file_path=file).items()])
      for file in all_files]
 ).reset_index(drop=True)).to_excel(f"{path}final_df.xlsx", index=False)
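A terser sketch of the same idea, relying on the fact that pd.concat accepts the dict returned by read_excel(sheet_name=None) directly and uses its keys as an index level (the level name sheet_name is made up here, and empty sheets are not filtered out):
import pandas as pd
from pathlib import Path

path = "C:/Users/Alan/Desktop/"
frames = [pd.concat(pd.read_excel(f, sheet_name=None), names=["sheet_name", None])
          for f in Path(path).rglob("*.xlsx")]
(pd.concat(frames)
   .reset_index(level="sheet_name")
   .to_excel(f"{path}final_df.xlsx", index=False))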

How do I use Python to iterate through a directory and delete specific columns from all CSVs?

I have a directory with several csvs.
from glob import glob
files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
Each csv has the same below columns. Reprex below-
yes no maybe ofcourse
1 2 3 4
I want my script to iterate through all csvs in the folder and delete the columns maybe and ofcourse.
If glob provides you with file paths, you can do the following with pandas:
import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
drop = ['maybe', 'ofcourse']
for file in files:
    df = pd.read_csv(file)
    for col in drop:
        if col in df:
            df = df.drop(col, axis=1)
    df.to_csv(file, index=False)  # index=False avoids writing an extra index column
Alternatively, if you want a cleaner way to avoid KeyErrors from drop, you can do this:
import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
drop = ['maybe', 'ofcourse']
for file in files:
    df = pd.read_csv(file)
    df = df.drop([c for c in drop if c in df], axis=1)
    df.to_csv(file, index=False)
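pandas can also do that filtering for you: DataFrame.drop accepts errors='ignore', which silently skips labels that are not present. A minimal sketch:
import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
for file in files:
    df = pd.read_csv(file)
    # errors='ignore' skips any listed column that is missing instead of raising KeyError
    df = df.drop(columns=['maybe', 'ofcourse'], errors='ignore')
    df.to_csv(file, index=False)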
Do you mean something like this:
files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
for filename in files:
    df = pd.read_csv(filename)
    df = df.drop(['maybe', 'ofcourse'], axis=1)
    df.to_csv(filename, index=False)
This code will remove the maybe and ofcourse columns and save the result back to the CSV.
You can use pandas to read a CSV file into a dataframe and then use drop() to remove specific columns, something like below:
df = pd.read_csv(csv_filename)
df = df.drop(['maybe', 'ofcourse'], axis=1)
If the files look exactly like what you have there (tab-separated), then maybe something like this:
import pandas as pd
from glob import glob

files = glob(r'C:/Users/jj/Desktop/Bulk_Wav/*.csv')
for filename in files:
    df = pd.read_csv(filename, sep='\t')
    df.drop(['maybe', 'ofcourse'], axis=1, inplace=True)
    df.to_csv(filename, sep='\t', index=False)

Multiple csv not being added to pandas

I hope you can help me with this problem.
I am having issues with adding multiple CSV files in pandas.
I have 12 files of sales data that have the same columns (one for each month: Sales_January_2019, Sales_February_2019.... and so on until December).
I've tried the following code but it doesn't seem to work; also, the index should be continuous and not reset after each file. I tried reset_index() but that didn't work either.
import pandas as pd
import glob

path = r'C:\Users\ricar\.spyder-py3\data'  # my path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=0, header=0)
    li.append(df)
    df.reset_index(inplace=True)
frame = pd.concat(li, axis=0, ignore_index=True)
df.drop(columns=['x_t', 'perf'], inplace=True)
print(df)
Try correcting your code like this:
import pandas as pd
import glob
path = r'C:\Users\ricar\.spyder-py3\data' # my path
files = glob.glob(path + "/*.csv")
# Make a list of dataframes
li = [pd.read_csv(file, index_col=0, header=0) for file in files]
# Concatenate dataframes and remove useless columns
df = pd.concat(li, axis=0, ignore_index=True)
df.drop(columns=["x_t", "perf"], inplace=True)
print(df)
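If you later need to know which file each row came from, one possible variation is to tag each frame while reading (the source column name is made up here; files is the glob list from above):
import os

li = [pd.read_csv(f, index_col=0, header=0).assign(source=os.path.basename(f))
      for f in files]
df = pd.concat(li, axis=0, ignore_index=True)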

Saving a pandas dataframe as CSV and overwriting an existing file

I always have two dataframes which come from different directories and share the same last four digits in their filenames. The filepaths are:
dir1 = "path/to/files1/"
dir2 = "path/to/files2/"
Then I use a loop to load and concatenate the dataframes that belong together into a dataframe df.
# For each file in the first directory
for i in os.listdir(dir1):
    # For each file in the second directory
    for j in os.listdir(dir2):
        # If the last 4 digits of filename match (ignoring file extension)
        if i[-8:-4] == j[-8:-4]:
            # Load CSVs into pandas
            print(i[-12:-4] + ' CPU Analysis')
            print('\n')
            df1 = pd.read_csv(dir1 + i, delimiter=',')
            df2 = pd.read_csv(dir2 + j, delimiter=';')
            df = pd.concat([df1, df2])
What I now want to do is to store df in dir1 using the same filename as before, i.e. I want to overwrite the existing file in dir1 and save as csv.
So, I think I should use something like this at the end of the loop:
df.to_csv(dir1, i[:-4])
But I am not sure about this.
I think it is possible here to join the values by +:
df = pd.concat([df1, df2])
df.to_csv(dir1 + i[:-4] + '.csv', index=False)
Or use f-strings:
df = pd.concat([df1, df2])
df.to_csv(f'{dir1}{i[:-4]}.csv', index=False)
But if you need the original extension, use the same path as for reading the file:
df = pd.concat([df1, df2])
df.to_csv(dir1 + i, index=False)
Or with an f-string:
df = pd.concat([df1, df2])
df.to_csv(f'{dir1}{i}', index=False)
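A pathlib-based variant of the same idea (a sketch; Path handles the separator for you, and to_csv accepts Path objects directly; df1, df2, dir1 and i are as in the question's loop):
from pathlib import Path
import pandas as pd

df = pd.concat([df1, df2])
# same filename as the input in dir1, original extension kept
df.to_csv(Path(dir1) / i, index=False)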

Merged Excel files overwriting the first column in Python using Pandas

I have a lot of Excel files and I want to append them using the following code:
import pandas as pd
import glob
import os
import openpyxl
df = []
for f in glob.glob("*.xlsx"):
    data = pd.read_excel(f, 'Sheet1')
    data.index = [os.path.basename(f)] * len(data)
    df.append(data)
df = pd.concat(df)
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()
The Excel files have this structure (screenshot omitted), and the output is the following (screenshot omitted).
Why does Python alter the first column when concatenating Excel files?
I think you need:
df = []
for f in glob.glob("*.xlsx"):
    data = pd.read_excel(f, 'Sheet1')
    name = os.path.basename(f)
    # create a MultiIndex so the original index is not overwritten
    data.index = pd.MultiIndex.from_product([[name], data.index], names=('files','orig'))
    df.append(data)
# reset index to turn the MultiIndex levels into columns
df = pd.concat(df).reset_index()
Another solution is to use the parameter keys in concat:
files = glob.glob("*.xlsx")
names = [os.path.basename(f) for f in files]
dfs = [pd.read_excel(f, 'Sheet1') for f in files]
df = pd.concat(dfs, keys=names).rename_axis(('files','orig')).reset_index()
Which is the same as:
df = []
names = []
for f in glob.glob("*.xlsx"):
    df.append(pd.read_excel(f, 'Sheet1'))
    names.append(os.path.basename(f))
df = pd.concat(df, keys=names).rename_axis(('files','orig')).reset_index()
Last, write to Excel with no index and no column names:
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer,'Sheet1', index=False, header=False)
writer.save()
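Note that ExcelWriter.save() was removed in recent pandas versions; an equivalent sketch using the context-manager form, which closes the file for you:
import pandas as pd

with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, 'Sheet1', index=False, header=False)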
