I have this code
import pandas as p
import csv
df = p.read_csv('interview1.csv')
df2 = df[['Participant', 'Translation']] # selects two of the columns in your file
df2.to_csv('out.csv')
How do i read multiple files and then write to 'out.csv'. So basically, instead of reading only interview1, i read interview2, interview3 to interview7 as well into the out.csv
Simply open the output file in append mode:
import pandas as p
import csv
csv_list=['interview1.csv', 'interview2.csv', ...]
for itw in csv_list:
df = p.read_csv(itw)
df.to_csv('out.csv', mode='a')
Use this to read all .CSV data from a folder and combined it together
import pandas as pd
import glob
import os
path = r'file path'
all_files = glob.glob(os.path.join(path, "*.csv"))
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df.to_csv("combined-data_new.csv")
Related
i have a code that takes all files from dir and create dataframe, then saves it to xls. How do I make it to just append new files instead of creating new dataframe each time from all files?
import os
import pandas as pd
import numpy as np
import glob
import time
path = r'C:\Users\user\Desktop\test' # path
all_files = glob.glob(os.path.join(path, "*.xlsm")) # advisable to use os.path.join as this makes concatenation OS independent
df_from_each_file = (pd.read_excel(f, sheet_name='Rank', header= 2, usecols=['Model', 'HK']) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df = concatenated_df.set_index('Model')
#Save dataframe to excel file
concatenated_df.to_excel(r'C:\Users\user\Desktop\test\output.xlsx')
I have an Excel file that needs to be refreshed automatically every week. It must be extended by other Excel files. The problem is that these files have different names each time.
So in my opinion i can not use code like:
import pandas as pd
NG = 'NG.xlsx'
df = pd.read_excel(NG)
because the filename is not always "NG" like in this case.
Do you have any ideas?
Best Greetz
You could read all the files in your folder by doing this, because it allows you to ignore name changes:
import sys
import csv
import glob
import pandas as pd
# get data file names
path = r"C:\.......\folder_with_excel"
filenames = glob.glob(path + "/*.xlsx")
DF = []
for df in dfs:
xl_file = pd.ExcelFile(filenames)
df=xl_file.parse('Sheet1')
DF.concat(df, ignore_index=True)
Alternatively:
import os
import pandas as pd
path = os.getcwd()
files = os.listdir(path) # list all the files in you directory
files_xls = [f for f in files if f[-3:] == 'xlsx'] # make a list of the xlsx
df = pd.DataFrame()
for f in files_xls:
info = pd.read_excel(f, '<sheet name>') # remove <sheet name if you don't need it
df = df.append(info)
I'm trying to merge multiple csv to one bigfile.
The script is working but I would like to have only the first header, and not one for each csv within bigfile.
How could I do it, shouldn't work with header=None?
import os
import glob
import pandas
def concatenate(inDir = r'myPath', outFile = r"outPath"):
os.chdir(inDir)
fileList = glob.glob("*.csv") #generate a list of csv files using the glob method
dfList = []
for filename in fileList:
print (filename)
df = pandas.read_csv(filename, header=None)
dfList.append(df)
concatDf = pandas.concat(dfList, axis=0)
concatDf.to_csv(outfile, index=None) # export the dataframe to a csv file
I'm trying to merge a bunch of xlsx files into a single pandas dataframe in python. Furthermore, I want to include a column that lists the source file for each row. My code is as follows:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import glob
import os
# get the path for where the xlsx files are
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']
# create new dataframe
df = pd.DataFrame()
# read data from files and add into dataframe
for f in files_xlsx:
data = pd.read_excel(f, 'Sheet 1')
df['Source_file'] = f
df = df.append(data)
however, when I look at the 'Source_file' column it lists the final file it reads as the name for every row. I've spent way more time than I should trying to fix this. What am I doing wrong?
within your for loop you are writing over each iteration of df so you'll only get back the final file,
what you need to do is delcare a list before hand and append do that,
since you called glob lets use that as well.
files = glob.glob(os.path.join(os.getcwd()) + '\*.xlsx')
dfs = [pd.read_excel(f,sheet_name='Sheet1') for f in files]
df = pd.concat(dfs)
if you want to add the filename into the df too then,
files = glob.glob(os.path.join(os.getcwd()) + '\*.xlsx')
dfs = [pd.read_excel(f,sheet_name='Sheet1') for f in files]
file_names = [os.path.basename(f) for f in files]
df = pd.concat(dfs,keys=file_names)
Using Pathlib module (recommended Python 3.4+)
from pathlib import Path
files = [f for f in Path.cwd().glob('*.xlsx')]
dfs = [pd.read_excel(f,sheet_name='Sheet1') for f in files]
file_names = [f.stem for f in files]
df = pd.concat(dfs,keys=file_names)
or as a one liner :
df = pd.concat([pd.read_excel(f) for f in Path.cwd().glob('*.xlsx')],keys=[f.stem for f in Path.cwd().glob('*.xlsx')],sort=False)
I am working with lots of csv files and need to add column. I tried glob, for example:
import glob
filenames = sorted(glob.glob('./DATA1/*2018*.csv'))
filenames = filenames[0:10]
import numpy as np
import pandas as pd
for f in filenames:
df = pd.read_csv(f, header=None, index_col=None)
df.columns = ['Date','Signal','Data','Code']
#this is what I should add to all csv files
df["ID"] = df["Data"].str.slice(0,2)
and I need a way to save the file back to csv (not concatenated) with different name such as "file01edited.csv" after I add the column to each csv file.
Use to_csv with f-strings for change file names:
for f in filenames:
df = pd.read_csv(f, names=['Date','Signal','Data','Code'], index_col=None)
#this is what I should add to all csv files
df["ID"] = df["Data"].str.slice(0,2)
#python 3.6+
df.to_csv(f'{f[:-4]}edited.csv', index=False)
#python bellow 3.6
#df.to_csv('{}edited.csv'.format(f[:-4]), index=False)