I'm trying to merge multiple CSV files into one big file.
The script works, but I would like to keep only the first header, not one header per CSV inside the big file.
How could I do that — shouldn't it work with header=None?
import os
import glob
import pandas


def concatenate(inDir=r'myPath', outFile=r"outPath"):
    """Merge every CSV file in *inDir* into a single CSV at *outFile*.

    Reading each file with the default ``header=0`` consumes its header
    row as column names instead of data, so the combined output contains
    exactly one header line (the asker's goal). ``header=None`` would do
    the opposite: every file's header row would be kept as a data row.
    """
    os.chdir(inDir)
    fileList = glob.glob("*.csv")  # every .csv in the input directory
    dfList = []
    for filename in fileList:
        print(filename)
        # Default header handling: first row becomes the column names,
        # so per-file headers are not duplicated in the output.
        df = pandas.read_csv(filename)
        dfList.append(df)
    concatDf = pandas.concat(dfList, axis=0)
    # Bug fix: the original wrote `outfile` (undefined name, NameError);
    # the parameter is spelled `outFile`.
    concatDf.to_csv(outFile, index=None)
Related
I need to extract an Excel worksheet from multiple workbooks and saving it to a dataframe and in turn saving that dataframe.
I have a spreadsheet that is generated at the end of each month (e.g.
June 2019.xlsx, May 2019.xlsx, April 2019.xlsx).
I need to grab a worksheet 'Sheet1'from each of these workbooks and convert these to a dataframe (df1).
I would like to have this dataframe saved.
As a nice to have, I would also like some way just to append the next month's data after the initial 'data grab'.
I'm relatively new to this, so I haven't made much progress.
import os
import glob
import pandas as pd
import xlrd
import json
import io
import flatten_json

# Map each workbook's base name (extension stripped) to the DataFrame of
# its first sheet, e.g. "June 2019.xlsx" -> dfs["June 2019"].
files = glob.glob('/Users/ngove/Documents/Python Scripts/2019/*.xlsx')
dfs = {
    os.path.splitext(os.path.basename(path))[0]: pd.read_excel(path)
    for path in files
}
You can drop all of your files in a directory (e.g. current directory). Then append all of your excel files in a list (e.g. files_xls). Iterate over all your files and use pandas.read_excel to get the respective dataframes (e.g. list_frames).
Below, you can find an example:
import os
import pandas as pd

path = os.getcwd()          # current working directory
files = os.listdir(path)    # every entry in the current directory

# Keep only Excel workbooks (.xls or .xlsm; adjust suffixes as needed).
files_xls = [f for f in files if (f[-3:] == 'xls' or f[-4:] == 'xlsm')]

list_frames = []
for f in files_xls:
    print("Processing file: %s" % f)
    try:
        # Bug fix: the original passed read_csv-only arguments to
        # read_excel (sep, error_bad_lines, skip_blank_lines, comment
        # as a multi-char string), which raises TypeError -- an error
        # that the bare `except: pass` then silently swallowed.
        data = pd.read_excel(f, 'Sheet1', header=0, index_col=None)
    except Exception as exc:
        # Bug fix: report and skip failed files. The original appended
        # OUTSIDE the try, so a failed read either raised NameError
        # (first file) or appended the previous file's data twice.
        print("Skipping %s: %s" % (f, exc))
        continue
    list_frames.append(data)

# Stack all frames, fill missing cells with 0, drop exact duplicates.
df = pd.concat(list_frames, sort=False).fillna(0)
df = df.drop_duplicates()

# Save the combined result to a single workbook.
writer = pd.ExcelWriter("your_title" + ".xlsx", engine='xlsxwriter')
df.to_excel(writer, sheet_name="Sheets1", index=False)
writer.save()
I hope this helps.
I interpreted your statement that you want to save the dataframe as that you want to save it as a combined Excel file. This will combine all files in the folder specified that end in xlsx.
import os
import pandas as pd
from pandas import ExcelWriter

os.chdir("H:/Python/Reports/")  # edit this to be your path
path = os.getcwd()
files = os.listdir(path)

# Only .xlsx workbooks in the directory.
files_xlsx = [f for f in files if f[-4:] == 'xlsx']

# Read 'Sheet1' of every workbook and stack them into one frame.
# Fix: DataFrame.append is deprecated (removed in pandas 2.0) and
# quadratic when called in a loop; collect the frames and concat once.
frames = [pd.read_excel(f, 'Sheet1') for f in files_xlsx]
df = pd.concat(frames) if frames else pd.DataFrame()

writer = ExcelWriter('Combined_Data.xlsx')
df.to_excel(writer, 'Sheet1', index=False)
writer.save()
You could update the code to grab all 2019 files by changing the one line to this:
files_xlsx = [f for f in files if f[-9:] == '2019.xlsx']
I referenced this question for most of the code and updated for xlsx and added the file save portion of the code
I'm new to pandas and Python, so I ran into some trouble. I have one large Excel file which I need to divide into multiple worksheets using a Python script, splitting it based on the IP addresses given in the data. I can't figure out how to do that and would appreciate some help and guidance.
I have no prior experience with Python or any of its libraries. This is what I did, but it created a worksheet for each row instead of each IP group.
import pandas as pd

# Load the 'Total' sheet. The original spelled out every read_excel
# default explicitly; only the non-default arguments are kept here.
df = pd.read_excel("D:/Users/Zakir/Desktop/MyNotebooks/Legacy.xls",
                   sheet_name="Total", header=0)

writer = pd.ExcelWriter('D:/Users/Zakir/Desktop/MyNotebooks/pandas_simple.xlsx',
                        engine='xlsxwriter')

# Bug fix: iterating row-by-row rewrote the sheet named row['IPAddress']
# for every single row, leaving one row per sheet. Grouping by IPAddress
# writes each group's rows together: one sheet per distinct address.
for ip, group in df.groupby('IPAddress'):
    group.set_index('Number').to_excel(writer, sheet_name=str(ip))

writer.save()
This is the kind of excel file i have. over 5000 rows. There are 60 groups of ip addresses and have to divide each group into its own worksheet
one solution if you have enough memory:
import pandas as pd
from pandas import ExcelWriter

# One solution if you have enough memory: read the whole workbook once,
# then write one sheet per distinct IPAddress value.
# Fix: the original call ended "header=0, #other settings.....#)" -- the
# '#' commented out the closing parenthesis, a SyntaxError. Further
# read_excel settings must be real keyword arguments. `pd` is also
# imported explicitly (the original used it without importing it).
df = pd.read_excel('file', sheet_name="Total", header=0)

writer = ExcelWriter('E:/output.xlsx', engine='xlsxwriter')
print(df)


def writesheet(g):
    """Write one IPAddress group to a sheet named after that address."""
    a = str(g['IPAddress'].tolist()[0])
    g.to_excel(writer, sheet_name=a, index=False)  # index=True to keep the index


df.groupby('IPAddress').apply(writesheet)
writer.save()
This is how I implemented the code to check a folder, loop through all excel files and split each file by the values of a column name, which can be passed as input(vColName), assuming one sheet in the file:
import sys
import os, shutil
from os import listdir
from os.path import isfile, join
import pandas as pd
import urllib as ul
import datetime
import xlrd
# Retrieve all filenames in a folder that carry the given extension.
def find_excel_filenames(path_to_dir, suffix=".xlsx"):
    """Return the names of files in *path_to_dir* ending with *suffix*."""
    matches = []
    for name in listdir(path_to_dir):
        if name.endswith(suffix):
            matches.append(name)
    return matches
# This folder contains the .xlsx files to split.
# Bug fix: "D:\files\sample\" ended with a backslash that escaped the
# closing quote (SyntaxError); double the backslashes instead.
filePath = "D:\\files\\sample\\"

# Column whose distinct values define the split.
# NOTE(review): vColName was never defined in the original snippet --
# replace this placeholder with the real column name before running.
vColName = "ColumnName"

# There is a subfolder to move the processed files to,
# and another subfolder for the split output files.
archivePath = os.path.join(filePath, "archive")
outPath = os.path.join(filePath, "output")

# Get a list of .xlsx filenames in the input folder.
fnames = find_excel_filenames(filePath)

# Loop through each file.
for fl in fnames:
    vFile = os.path.join(filePath, fl)

    # Open the file twice: first to discover the columns, then with an
    # all-string converter -- keeping values as text preserves leading
    # zeros that numeric parsing would trim.
    df = pd.read_excel(vFile, header=None)
    converter = {col: str for col in df.columns}
    df1 = pd.read_excel(vFile, converters=converter)

    # One output workbook per distinct value of the split column.
    for v in df1[vColName].unique().tolist():
        filteredDF = df1.loc[df1[vColName] == v]
        # Bug fix: the original used doubled quotes (''_'', ''.xlsx'',
        # ''xlsxwriter'', ''Sheet1''), which are syntax errors; restore
        # ordinary single-quoted string literals.
        vOutFile = os.path.join(outPath, fl + '_' + v.replace("/", " ") + '.xlsx')
        writer = pd.ExcelWriter(vOutFile, engine='xlsxwriter')
        # Convert the dataframe to an XlsxWriter Excel object.
        filteredDF.to_excel(writer, sheet_name='Sheet1')
        # Close the Pandas Excel writer and output the Excel file.
        writer.save()

    # Move the processed file to the archive folder, removing any
    # earlier copy first so shutil.move cannot fail on an existing target.
    dst_file = os.path.join(archivePath, fl)
    if os.path.exists(dst_file):
        os.remove(dst_file)
    shutil.move(vFile, archivePath)
I am working with lots of csv files and need to add column. I tried glob, for example:
import glob

# First ten 2018 files, in sorted (ascending filename) order.
filenames = sorted(glob.glob('./DATA1/*2018*.csv'))[0:10]

import numpy as np
import pandas as pd

for path in filenames:
    df = pd.read_csv(path, header=None, index_col=None)
    df.columns = ['Date', 'Signal', 'Data', 'Code']
    # The column to add to every CSV file:
    # first two characters of the Data field.
    df["ID"] = df["Data"].str.slice(0, 2)
and I need a way to save the file back to csv (not concatenated) with different name such as "file01edited.csv" after I add the column to each csv file.
Use to_csv with f-strings for change file names:
for csv_path in filenames:
    df = pd.read_csv(csv_path,
                     names=['Date', 'Signal', 'Data', 'Code'],
                     index_col=None)
    # The column to add to every CSV file.
    df["ID"] = df["Data"].str.slice(0, 2)
    # Python 3.6+: f-string turns "file01.csv" into "file01edited.csv".
    df.to_csv(f'{csv_path[:-4]}edited.csv', index=False)
    # Python below 3.6:
    # df.to_csv('{}edited.csv'.format(csv_path[:-4]), index=False)
I need to concatenate csv files with same column headers in python. The csv files with the following filenames should concatenate in order as shown below(ascending order of filename):
AB201602.csv
AB201603.csv
AB201604.csv
AB201605.csv
AB201606.csv
AB201607.csv
AB201608.csv
AB201610.csv
AB201612.csv
I would like to keep the column headers only from first file. Any idea?
I tried to use the code below, but it combined the CSV files in random filename order and truncated half of the column header names. Thanks.
# Sort so the files merge in ascending filename order
# (glob alone returns them in arbitrary order -- the reported problem).
csvfiles = sorted(glob.glob('/home/c/*.csv'))

# Fix: open the output in text mode with newline='' as the Python 3 csv
# module requires; the original 'wb' is the Python 2 idiom. Context
# managers close every handle.
with open('/home/c/output.csv', 'w', newline='') as out_fh:
    wf = csv.writer(out_fh, delimiter=',')
    for i, files in enumerate(csvfiles):
        with open(files, 'r') as in_fh:
            rd = csv.reader(in_fh, delimiter=',')
            # Fix: rd.next() is Python 2 (use next(rd)), and it skipped
            # EVERY header including the first; keep the first file's
            # header so the output has exactly one.
            header = next(rd)
            if i == 0:
                wf.writerow(header)
            for row in rd:
                print(row)
                wf.writerow(row)
Using #Gokul comment and pandas.
import pandas as pd
import glob

# Sort so files concatenate in ascending filename order
# (AB201602.csv, AB201603.csv, ...); glob alone gives arbitrary order.
csvfiles = sorted(glob.glob('/home/c/*.csv'))

# Fix: DataFrame.append is deprecated (removed in pandas 2.0) and
# quadratic when called in a loop; read everything, then concat once.
# Only the first file's header survives as the column names.
df = pd.concat([pd.read_csv(f) for f in csvfiles]) if csvfiles else pd.DataFrame()

df.to_csv('newfile.csv')
I have this code
import pandas as p
import csv

# Load the interview transcript, keep just the two columns of interest,
# and write them back out.
interview = p.read_csv('interview1.csv')
selected = interview[['Participant', 'Translation']]  # column subset
selected.to_csv('out.csv')
How do i read multiple files and then write to 'out.csv'. So basically, instead of reading only interview1, i read interview2, interview3 to interview7 as well into the out.csv
Simply open the output file in append mode:
import pandas as p
import csv

csv_list = ['interview1.csv', 'interview2.csv', ...]  # files to combine

for i, itw in enumerate(csv_list):
    df = p.read_csv(itw)
    # Fix: with a bare mode='a', every to_csv call wrote its own header
    # row (and an index column) into out.csv. Emit the header only for
    # the first file so the combined file has a single header.
    df.to_csv('out.csv', mode='a', header=(i == 0), index=False)
Use this to read all .CSV data from a folder and combined it together
import pandas as pd
import glob
import os

# Read every .csv in the folder and combine them into one frame.
path = r'file path'
all_files = glob.glob(os.path.join(path, "*.csv"))

# Lazily read each file, then stack everything with a fresh 0..n-1 index.
frames = (pd.read_csv(csv_file) for csv_file in all_files)
concatenated_df = pd.concat(frames, ignore_index=True)
concatenated_df.to_csv("combined-data_new.csv")