Add filename as header while merging csv files - python

Want to combine all csv in one folder. This works as intended.
import os
import glob
import pandas as pd
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames], axis = 1)
#export to csv
combined_csv.to_csv( "combined.matrix", index=False)
However, I would like to add the filename (without its extension) as a header row above each file's columns.
File1.csv
A,B
1,2
3,4
File2.csv
A,B
5,6
combined.matrix
File1,File1,File2,File2
A,B,A,B
1,2,5,6
3,4,,

Try the below code:
import pandas as pd
# Input files whose names (minus the extension) become the top header row.
all_filenames = ['File1.csv', 'File2.csv']
# Strip only the final extension; str.replace('.csv', '') would also remove
# a '.csv' that happens to occur in the middle of a file name.
headers = [name.rsplit('.', 1)[0] for name in all_filenames]
# keys= combined with axis=1 adds `headers` as an outer column level, so each
# file's columns are grouped under that file's name.
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames], keys=headers, axis=1)
Created a header list with file names excluding the extension. Pass the list to keys argument in pd.concat function.

The basic idea being that you can include the file names somewhere in the DataFrame itself (in this case I am including it in the column names, you could probably include them in a row as well) as you are anyway exporting it to csv for further processing
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# This takes the value ["file1.csv", "file2.csv"]
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames], axis = 1)
# This looks like
# A B A B
# 1 2 5 6
# 3 4 nan nan
As the column names are fixed (A and B) - and you are more interested in the file names, you can change the columns with
# Label every column with the file it came from. Each name is repeated in the
# same order the frames were concatenated; sorting the labels could misalign
# them with glob's (arbitrary) file order.
# Integer division is required here: `list * n / m` raises TypeError.
# NOTE: assumes every file contributes the same number of columns.
cols_per_file = len(combined_csv.columns) // len(all_filenames)
combined_csv.columns = [name for name in all_filenames for _ in range(cols_per_file)]
# For ["file1.csv", "file2.csv"] and 4 columns this gives ["file1.csv", "file1.csv", "file2.csv", "file2.csv"]
And now your dataframe would look like - which indicates which column is from which file
# file1.csv file1.csv file2.csv file2.csv
# 1 2 5 6
# 3 4 nan nan
Which you can export to the combined.matrix.csv

import os
import pandas as pd
parent_dir = 'YOUR_PARENT_DIRECTORY_PATH'
ext = 'csv'
# Collect one renamed frame per matching file found anywhere under parent_dir.
frames = []
for root, _dirs, files in os.walk(parent_dir):
    for f in files:
        filename, extension = os.path.splitext(f)
        if extension != f'.{ext}':
            continue  # not a file of the requested type
        new_df = pd.read_csv(os.path.join(root, f))
        # Prefix every column with the file's base name so its origin is visible.
        new_df.columns = [f'{filename}{c}' for c in new_df.columns]
        frames.append(new_df)
# Concatenate once at the end instead of on every iteration (which re-copies
# the accumulated frame each time). pd.concat rejects an empty list, so fall
# back to an empty frame when no matching files were found.
combined_csv = pd.concat(frames, axis=1) if frames else pd.DataFrame()
combined_csv.to_csv("combined.matrix", index=False)

Related

Adding dataframe column names based on filename after merging using Glob

I have Excel files in a folder, all in the same format with data for all countries in the world in the sheet 'Dataset2' in each file.
I have merged all files together into one using glob, but I need to know which file (i.e. which country) each column comes from.
Is there a way to do this?
import glob
import os
import pandas as pd
os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined = pd.concat([pd.read_excel(f, sheet_name='Dataset2') for f in all_filenames ],axis=1, ignore_index=True)
combined.to_excel( "New/combined.xlsx", index=False, encoding='utf-8-sig')
You could unpack the list comprehension into a for-loop and add an additional column to each data file, something like this:
import glob
import os
import pandas as pd
os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
file_list = []
for f in all_filenames:
data = pd.read_excel(f, sheet_name='Dataset2')
data['source_file'] = f # create a column with the name of the file
file_list.append(data)
combined = pd.concat(file_list, axis=1, ignore_index=True)
combined.to_excel( "New/combined.xlsx", index=False, encoding='utf-8-sig')
If you're using the os module, try os.path.basename and pass the resulting names to the keys argument of pd.concat:
import glob
import os
import pandas as pd
os.chdir(r"C:\Users\Umar.Hussain\OneDrive - Ricoh Europe PLC\Documents\Excels")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
names = [os.path.basename(f) for f in all_filenames]
combined = pd.concat([pd.read_excel(f, sheet_name='Sheet1') for f in all_filenames],keys=names,axis=1 )
As you're using axis=1, this will add the keys to the column header. Alternatively, you may want to read each Excel file first, add the file name as a column, and collect the frames in a list, like this:
dfs = []
for file in all_filenames:
df = pd.read_excel(file)
df['source'] = os.path.basename(file)
dfs.append(df)

Import multiple excel files and merge into single pandas df with source name as column

I'm trying to merge a bunch of xlsx files into a single pandas dataframe in python. Furthermore, I want to include a column that lists the source file for each row. My code is as follows:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import glob
import os
# get the path for where the xlsx files are
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']
# create new dataframe
df = pd.DataFrame()
# read data from files and add into dataframe
for f in files_xlsx:
data = pd.read_excel(f, 'Sheet 1')
df['Source_file'] = f
df = df.append(data)
however, when I look at the 'Source_file' column it lists the final file it reads as the name for every row. I've spent way more time than I should trying to fix this. What am I doing wrong?
Within your for loop you are overwriting the 'Source_file' column of df on each iteration, so you only get back the name of the final file.
What you need to do is declare a list beforehand and append to that.
Since you imported glob, let's use that as well.
# Build the pattern with os.path.join so the separator is correct on every OS;
# the literal '\*.xlsx' is an invalid escape sequence and Windows-only.
files = glob.glob(os.path.join(os.getcwd(), '*.xlsx'))
# Read the 'Sheet1' sheet of every workbook and stack the rows.
dfs = [pd.read_excel(f, sheet_name='Sheet1') for f in files]
df = pd.concat(dfs)
if you want to add the filename into the df too then,
# Build the pattern with os.path.join so the separator is correct on every OS;
# the literal '\*.xlsx' is an invalid escape sequence and Windows-only.
files = glob.glob(os.path.join(os.getcwd(), '*.xlsx'))
dfs = [pd.read_excel(f, sheet_name='Sheet1') for f in files]
# Use each workbook's file name as the outer index level of its rows.
file_names = [os.path.basename(f) for f in files]
df = pd.concat(dfs, keys=file_names)
Using Pathlib module (recommended Python 3.4+)
from pathlib import Path
files = [f for f in Path.cwd().glob('*.xlsx')]
dfs = [pd.read_excel(f,sheet_name='Sheet1') for f in files]
file_names = [f.stem for f in files]
df = pd.concat(dfs,keys=file_names)
or as a one liner :
df = pd.concat([pd.read_excel(f) for f in Path.cwd().glob('*.xlsx')],keys=[f.stem for f in Path.cwd().glob('*.xlsx')],sort=False)

Need to pick 'second column' from multiple csv files and save all 'second columns' in one csv file

So I have 366 CSV files and I want to copy their second columns and write them into a new CSV file. Need a code for this job. I tried some codes available here but nothing works. please help.
Assuming all the 2nd columns are the same length, you could simply loop through all the files. Read them, save the 2nd column to memory and construct a new df along the way.
filenames = ['test.csv', ....]
new_df = pd.DataFrame()
for filename in filenames:
df = pd.read_csv(filename)
second_column = df.iloc[:, 1]
new_df[f'SECOND_COLUMN_{filename.upper()}'] = second_column
del(df)
new_df.to_csv('new_csv.csv', index=False)
filenames = glob.glob(r'D:/CSV_FOLDER' + "/*.csv")
new_df = pd.DataFrame()
for filename in filenames:
df = pd.read_csv(filename)
second_column = df.iloc[:, 1]
new_df[f'SECOND_COLUMN_{filename.upper()}'] = second_column
del(df)
new_df.to_csv('new_csv.csv', index=False)
This can be accomplished with glob and pandas:
import glob
import pandas as pd
mylist = [f for f in glob.glob("*.csv")]
df = pd.read_csv(mylist[0]) #create the dataframe from the first csv
df = pd.DataFrame(df.iloc[:,1]) #only keep 2nd column
for x in mylist[1:]: #loop through the rest of the csv files doing the same
t = pd.read_csv(x)
colName = pd.DataFrame(t.iloc[:,1]).columns
df[colName] = pd.DataFrame(t.iloc[:,1])
df.to_csv('output.csv', index=False)
import glob
import pandas as pd
mylist = glob.glob("*.csv")
# Seed the result with the first file (the original referenced an undefined
# name `csvList` here, which raised NameError).
df = pd.read_csv(mylist[0])  # create the dataframe from the first csv
# Positional index 1 is the 2nd column; index 0 would select the 1st column.
df = pd.DataFrame(df.iloc[:, 1])  # only keep 2nd column
for x in mylist[1:]:  # loop through the rest of the csv files doing the same
    t = pd.read_csv(x)
    colName = pd.DataFrame(t.iloc[:, 1]).columns
    # NOTE: a file whose 2nd column has the same name as an earlier one
    # overwrites that earlier column.
    df[colName] = pd.DataFrame(t.iloc[:, 1])
df.to_csv('output.csv', index=False)

Importing multiple excel files into Python, merge and apply filename to a new column

I have a for loop that imports all of the Excel files in the directory and merge them together in a single dataframe. However, I want to create a new column where each row takes the string of the filename of the Excel-file.
Here is my import and merge code:
path = os.getcwd()
files = os.listdir(path)
df = pd.DataFrame()
for f in files:
data = pd.read_excel(f, 'Sheet1', header = None, names = ['col1','col2'])
df = df.append(data)
For example if first Excel file is named "file1.xlsx", I want all rows from that file to have value file1.xlsx in col3 (a new column). If the second Excel file is named "file2.xlsx" I want all rows from that file to have value file2.xlsx. Notice that there is no real pattern of the Excel files, and I just use those names as an example.
Many thanks
Create new column in loop:
# Collect the per-file frames and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copies all rows.
frames = []
for f in files:
    data = pd.read_excel(f, 'Sheet1', header=None, names=['col1', 'col2'])
    data['col3'] = f  # tag every row with the file it came from
    frames.append(data)
# pd.concat rejects an empty list, so keep an empty frame when there are no files.
df = pd.concat(frames) if frames else pd.DataFrame()
Another possible solution with list comprehension:
dfs = [pd.read_excel(f, 'Sheet1', header = None, names = ['col1','col2']).assign(col3 = f)
for f in files]
df = pd.concat(dfs)

Python: read through multiple but not all csv files in my folder

I want to read some csv files from my folder and concatenate them to a big pandas dataframe. All of my csv files end with a number, and I only want to read files whose number end with (6~10, 16~20, 26~30.) My goal is to read the files iteratively. Attached is my code so far:
import pandas as pd
data_one = pd.read_csv('Datafile-6.csv', header=None)
for i in range(7,11):
data99 = pd.read_csv('Datafile-'+i+'*.csv', header=None) #this line needs work
data_one = pd.concat([data_one, data99.iloc[:,1]],axis=1,ignore_index=True)
data_two = pd.read_csv('Datafile-16.csv', header=None)
for j in range(17,21):
#Repeat similar process
What should I do about 'data99' such that 'data_one' contains columns from 'Datafile-6' through 'Datafile-10'?
The first five rows of data_one should look like this, after getting data from Datafiles 6-10.
0 1 2 3 4 5
0 -40.0 0.179836 0.179630 0.179397 0.179192 0.179031
1 -39.0 0.183696 0.183441 0.183204 0.182977 0.182795
2 -38.0 0.186720 0.186446 0.186191 0.185949 0.185762
3 -37.0 0.189490 0.189207 0.188935 0.188686 0.188475
4 -36.0 0.192154 0.191851 0.191569 0.191301 0.191086
Column 0 is included in all of the data files, so I'm only concatenating column 1 of all of the subsequent data files.
You need to use glob module:
import glob, os
import pandas as pd
path = r'C:\YourFolder'  # path to folder with .csv files
# Named csv_paths rather than `all` so the builtin all() is not shadowed.
csv_paths = glob.glob(path + "/*.csv")
# Values of the last row that qualify a file; modify this list as needed.
wanted = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 26, 27, 28, 29, 30]
list_ = []
for file_ in csv_paths:
    df = pd.read_csv(file_, index_col=None, header=0)
    # .any() reduces the one-element boolean Series to a plain bool; using a
    # Series directly in `if` raises "truth value of a Series is ambiguous".
    if df['YourColumns'].tail(1).isin(wanted).any():
        list_.append(df)
# pd.concat rejects an empty list, so keep an empty frame when nothing matched.
d_frame = pd.concat(list_) if list_ else pd.DataFrame()

Categories

Resources