I hope you can help me with this problem.
I am having trouble combining multiple CSV files in pandas.
I have 12 files of sales data that all have the same columns, one for each month: Sales_January_2019, Sales_February_2019, and so on through December.
I've tried the following code, but it doesn't seem to work. Also, the index should be continuous and not reset after each file; I tried reset_index(), but that didn't work either.
import pandas as pd
import glob

path = r'C:\Users\ricar\.spyder-py3\data' # my path
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=0, header=0)
    li.append(df)

df.reset_index(inplace=True)
frame = pd.concat(li, axis=0, ignore_index=True)
df.drop(columns=['x_t', 'perf'], inplace=True)
print(df)
Try correcting your code like this:
import pandas as pd
import glob
path = r'C:\Users\ricar\.spyder-py3\data' # my path
files = glob.glob(path + "/*.csv")
# Make a list of dataframes
li = [pd.read_csv(file, index_col=0, header=0) for file in files]
# Concatenate dataframes and remove useless columns
df = pd.concat(li, axis=0, ignore_index=True)
df.drop(columns=["x_t", "perf"], inplace=True)
print(df)
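If you also want a column recording which month each row came from, a small variation works (a sketch, assuming filenames like Sales_January_2019.csv; the month column name is my own choice):

import os
import glob
import pandas as pd

path = r'C:\Users\ricar\.spyder-py3\data'  # my path
all_files = glob.glob(path + "/*.csv")

# Tag each frame with its source file's base name, e.g. "Sales_January_2019"
li = [
    pd.read_csv(f, index_col=0, header=0)
      .assign(month=os.path.splitext(os.path.basename(f))[0])
    for f in all_files
]
df = pd.concat(li, axis=0, ignore_index=True)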
I have to build a solution that unifies all the Excel files in a folder and generates a new consolidated Excel file with all the information. The files all have the same number of tabs (3), with the same names.
I tried it this way:
import pandas as pd
import glob

path = r"C:\Users\Alan\Desktop"
filenames = glob.glob(path + "\*.xlsx")
outputxlsx = pd.DataFrame()
for file in filenames:
    df = pd.concat(pd.read_excel(file, sheet_name=None), ignore_index=True, sort=False)
    outputxlsx = outputxlsx.append(df, ignore_index=True)
outputxlsx.to_excel(r"C:\Users\Alan\Desktop\Output.xlsx", index=False)
Unfortunately, on the first tab the header is replicated, and the other two tabs are not generated.
from pathlib import Path
import pandas as pd

def get_data_by_sheet(file_path: str) -> dict:
    # Map sheet name -> DataFrame, skipping empty sheets
    return {x: df for x, df in pd.read_excel(file_path, sheet_name=None).items() if not df.empty}

path = "C:/Users/Alan/Desktop/"
all_files = [x for x in Path(path).rglob("*.xlsx")]

(pd
 .concat([pd.concat([df for sheet, df in get_data_by_sheet(file_path=file).items()]) for file in all_files])
 .reset_index(drop=True)
).to_excel(f"{path}final_df.xlsx", index=False)
Or if you also want to know what workbook and sheet each row came from:
(pd
 .concat(
     [pd.concat([df.assign(file_name=Path(file).stem, sheet_name=sheet) for
                 sheet, df in get_data_by_sheet(file_path=file).items()]) for file in all_files]
 ).reset_index(drop=True)).to_excel(f"{path}final_df.xlsx", index=False)
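The nested comprehensions are dense; the same logic written as an explicit loop (equivalent to the snippet above, reusing the get_data_by_sheet helper) may be easier to follow:

frames = []
for file in all_files:
    for sheet, df in get_data_by_sheet(file_path=file).items():
        # Tag each row with the workbook and sheet it came from
        frames.append(df.assign(file_name=Path(file).stem, sheet_name=sheet))

pd.concat(frames).reset_index(drop=True).to_excel(f"{path}final_df.xlsx", index=False)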
Every day I have multiple Excel files with different names, but they all start with the same prefix, for instance "Answer1.xlsx", "AnswerAVD.xlsx", "Answer2312.xlsx", etc.
Is it possible to read and concatenate all these files into one pandas dataframe?
I know how to do it one by one, but that is not a solution:
import pandas as pd

dfs1 = pd.read_excel('C:/Answer1.xlsx')
dfs2 = pd.read_excel('C:/AnswerAVD.xlsx')
dfs3 = pd.read_excel('C:/Answer2312.xlsx')
Final = pd.concat([dfs1, dfs2, dfs3])
Many thanks for your help.
Use pathlib's glob method, then concatenate with pandas using a list comprehension:
from pathlib import Path
import pandas as pd
src_files = Path('C:\\').glob('*Answer*.xlsx')
df = pd.concat([pd.read_excel(f, index_col=None, header=0) for f in src_files])
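Note that Path.glob yields files in arbitrary, OS-dependent order; if the row order of the combined frame matters, sorting the paths first is a cheap safeguard, and ignore_index=True gives a continuous index (a small variation on the snippet above):

src_files = sorted(Path('C:\\').glob('*Answer*.xlsx'))
df = pd.concat([pd.read_excel(f, index_col=None, header=0) for f in src_files],
               ignore_index=True)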
@Kardu This will help you do it in a concise manner, and there are many useful comments there covering other alternatives.
Also, inspired by the same post, this should help.
import pandas as pd
import glob

path = r'C:'  # use your path
all_files = glob.glob(path + "/Answer*.xlsx")

li = []
for filename in all_files:
    df = pd.read_excel(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
I have a directory with several CSVs.
files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
Each CSV has the same columns. Reprex below:
yes  no  maybe  ofcourse
1    2   3      4
I want my script to iterate through all the CSVs in the folder and delete the columns maybe and ofcourse.
If glob provides you with file paths, you can do the following with pandas:
import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
drop = ['maybe', 'ofcourse']

for file in files:
    df = pd.read_csv(file)
    for col in drop:
        if col in df:
            df = df.drop(col, axis=1)
    df.to_csv(file, index=False)  # index=False so the saved file doesn't gain an extra index column
Alternatively, if you want a cleaner way to avoid KeyErrors from drop, you can do this:
import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
drop = ['maybe', 'ofcourse']

for file in files:
    df = pd.read_csv(file)
    df = df.drop([c for c in drop if c in df], axis=1)
    df.to_csv(file, index=False)
Do you mean something like this?
import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')

for filename in files:
    df = pd.read_csv(filename)
    df = df.drop(['maybe', 'ofcourse'], axis=1)
    df.to_csv(filename, index=False)
This code will remove the maybe and ofcourse columns and save the result back to each CSV.
You can use pandas to read a CSV file into a dataframe and then use drop() to remove specific columns, something like below:
df = pd.read_csv(csv_filename)
df = df.drop(['maybe', 'ofcourse'], axis=1)  # reassign: drop() returns a new dataframe
If the files look exactly like what you have there, then maybe something like this:
import pandas as pd
from glob import glob

files = glob(r'C:/Users/jj/Desktop/Bulk_Wav/*.csv')

for filename in files:
    df = pd.read_csv(filename, sep='\t')
    df.drop(['maybe', 'ofcourse'], axis=1, inplace=True)
    df.to_csv(filename, sep='\t', index=False)
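Whether sep='\t' is right depends on how the files are actually delimited; a quick hypothetical check (not part of the original answer):

with open(files[0]) as f:
    print(repr(f.readline()))  # '\t' between fields means tab-separated, ',' means comma-separated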
import pandas as pd
import numpy as np
import feather
import glob

path = r'C:/Users/user1/Desktop/Test' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
for i, filename in enumerate(all_files):
    df = pd.read_csv(filename, sep=',', index_col=None, header=0).assign(user_iD=filename)
    li.append(df)

data = pd.concat(li, axis=0, ignore_index=True)
df = data.copy()
df.to_feather('KT2test.ftr')
data1 = pd.read_feather('KT2test.ftr')
data1.tail(50)
The output I'm getting in the user_iD column is C:/Users/user1/Desktop/Test\u9.csv, but I only want the user id as u9, or just 9.
How can I get this done?
# Keep only the file's base name without the extension, e.g. "u9"
df = pd.read_csv(filename, sep=',', index_col=None, header=0).assign(user_iD=filename.split("\\")[-1].split(".")[0])
# Or split on "\u" to keep just the number, e.g. "9"
df = df.assign(user_iD=filename.split("\\u")[-1].split(".")[0])
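Splitting on "\\" assumes Windows-style separators in the path; a more portable sketch using pathlib (my suggestion, not from the original answer):

from pathlib import Path

df = df.assign(user_iD=Path(filename).stem)              # "u9"
df = df.assign(user_iD=Path(filename).stem.lstrip("u"))  # "9" (lstrip removes the leading "u")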
I have 10 CSV files with different dimensions and I want to concatenate them into one file, but whenever I try, the format changes. I want a single CSV file.
import glob
import pandas as pd

path = 'your/folder/'  # placeholder: folder containing the CSV files
dfs = glob.glob(path + '*.csv')
result = pd.concat([pd.read_csv(df, header=None) for df in dfs])
result.to_csv(path + 'merge.csv', header=None)
You may want to combine the CSV files horizontally. Use axis=1, for example:
df1 = pd.read_csv('f1.txt')
df2 = pd.read_csv('f2.txt')
combined = pd.concat([df1, df2], axis=1)
combined.to_csv('merged_csv.csv')
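To make the axis choice concrete, a minimal sketch: axis=0 stacks the frames vertically, while axis=1 places them side by side, aligned on the index.

import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'b': [3, 4]})

print(pd.concat([df1, df2], axis=0))  # 4 rows; columns a and b, with NaN where a frame lacks a column
print(pd.concat([df1, df2], axis=1))  # 2 rows; columns a and b side by side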
This worked for me:
import pandas as pd
import os

os.chdir(path)  # path: directory containing the CSV files
dfs = [pd.read_csv(f, parse_dates=[0])
       for f in os.listdir(os.getcwd()) if f.endswith('csv')]
result_csv = pd.concat(dfs, axis=1)
result_csv.to_csv('result.csv')
You have to use pd.concat. The code below only works for a few CSV files (3 here):
df1 = pd.read_csv(r"address\1.csv",
index_col=[0], parse_dates=[0])
df2 = pd.read_csv(r"address\2.csv",
index_col=[0], parse_dates=[0])
df3 = pd.read_csv(r"address\3.csv",
index_col=[0], parse_dates=[0])
finaldf = pd.concat([df1, df2, df3], axis=1, join='inner').sort_index()
finaldf.to_csv('result.csv')
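Note that join='inner' keeps only the timestamps present in every file. If you would rather keep all rows and accept NaNs for missing dates, the default join='outer' does that; a variation on the code above:

finaldf = pd.concat([df1, df2, df3], axis=1, join='outer').sort_index()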
With the code below you can concatenate as many CSV files as you like from the same directory:
import pandas as pd
import os

os.chdir(path)  # path: directory containing the CSV files
dfs = [pd.read_csv(f, index_col=[0], parse_dates=[0])
       for f in os.listdir(os.getcwd()) if f.endswith('csv')]
result_csv = pd.concat(dfs, axis=1, join='inner').sort_index()
result_csv.to_csv('result.csv')
It will save the output as result.csv in the same path used above.