Merge multiple csv files to one and hide the header - python

I'm trying to merge multiple csv files into one big file.
The script works, but I would like to have only the first header, not one for each csv within the big file.
How could I do it? Shouldn't it work with header=None?
import os
import glob
import pandas

def concatenate(inDir=r'myPath', outFile=r"outPath"):
    os.chdir(inDir)
    fileList = glob.glob("*.csv")  # generate a list of csv files using the glob method
    dfList = []
    for filename in fileList:
        print(filename)
        df = pandas.read_csv(filename, header=None)
        dfList.append(df)
    concatDf = pandas.concat(dfList, axis=0)
    concatDf.to_csv(outFile, index=None)  # export the dataframe to a csv file
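To keep a single header, one option (a minimal sketch, using the same placeholder paths as above) is to read each file with its header row, which is pandas' default header=0, instead of header=None; pandas.concat then aligns the frames on those column names, and to_csv writes the header exactly once:

import os
import glob
import pandas

def concatenate(inDir=r'myPath', outFile=r"outPath"):
    os.chdir(inDir)
    # header=0 is the default, so each file's first row becomes the column names
    dfList = [pandas.read_csv(f) for f in glob.glob("*.csv")]
    concatDf = pandas.concat(dfList, axis=0, ignore_index=True)
    concatDf.to_csv(outFile, index=False)  # the header row is written once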

Related

Grabbing a single Excel worksheet from multiple workbooks into a pandas dataframe and saving this

I need to extract an Excel worksheet from multiple workbooks, save it to a dataframe, and in turn save that dataframe.
I have a spreadsheet that is generated at the end of each month (e.g.
June 2019.xlsx, May 2019.xlsx, April 2019.xlsx).
I need to grab the worksheet 'Sheet1' from each of these workbooks and convert them to a dataframe (df1).
I would like to have this dataframe saved.
As a nice to have, I would also like some way to just append the next month's data after the initial 'data grab'.
I'm relatively new to this, so I haven't made much progress.
import os
import glob
import pandas as pd
import xlrd
import json
import io
import flatten_json

files = glob.glob('/Users/ngove/Documents/Python Scripts/2019/*.xlsx')
dfs = {}
for f in files:
    dfs[os.path.splitext(os.path.basename(f))[0]] = pd.read_excel(f)
You can drop all of your files in a directory (e.g. the current directory), collect all of the excel filenames in a list (e.g. files_xls), then iterate over the files and use pandas.read_excel to get the respective dataframes (e.g. list_frames).
Below, you can find an example:
import os
import pandas as pd

path = os.getcwd()  # get current dir
files = os.listdir(path)  # get all the files in your current dir
# keep only the xls or xlsm files (this depends on you)
files_xls = [f for f in files if f[-3:] == 'xls' or f[-4:] == 'xlsm']

list_frames = []
for f in files_xls:
    print("Processing file: %s" % f)
    try:
        # the following gives you the dataframe;
        # the read_excel params depend on your data format
        data = pd.read_excel(f, 'Sheet1', header=0, index_col=None)
        list_frames.append(data)
    except Exception as e:
        print("Skipping %s: %s" % (f, e))

# at the end you can concat your data if you want and remove any duplicates
df = pd.concat(list_frames, sort=False).fillna(0)
df = df.drop_duplicates()

# at the end you can save it
writer = pd.ExcelWriter("your_title" + ".xlsx", engine='xlsxwriter')
df.to_excel(writer, sheet_name="Sheet1", index=False)
writer.save()
I hope this helps.
I interpreted your statement that you want to save the dataframe as meaning that you want to save it as a combined Excel file. This will combine all files in the specified folder that end in xlsx.
import os
import pandas as pd
from pandas import ExcelWriter

os.chdir("H:/Python/Reports/")  # edit this to be your path
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']

df = pd.DataFrame()
for f in files_xlsx:
    data = pd.read_excel(f, 'Sheet1')
    df = df.append(data)

writer = ExcelWriter('Combined_Data.xlsx')
df.to_excel(writer, 'Sheet1', index=False)
writer.save()
You could update the code to grab all 2019 files by changing the one line to this:
files_xlsx = [f for f in files if f[-9:] == '2019.xlsx']
I referenced this question for most of the code, updated it for xlsx, and added the file-save portion of the code.
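As for the "nice to have" of appending the next month's data after the initial grab: a minimal sketch, assuming the combined file is the Combined_Data.xlsx produced above and the new workbook follows the same 'Month Year.xlsx' naming (the filename below is hypothetical), is to read the combined file back in, append the new month, and rewrite it:

import pandas as pd
from pandas import ExcelWriter

new_month_file = 'July 2019.xlsx'  # hypothetical next monthly workbook

combined = pd.read_excel('Combined_Data.xlsx', 'Sheet1')
new_data = pd.read_excel(new_month_file, 'Sheet1')
combined = combined.append(new_data)  # df.append as above; pd.concat([combined, new_data]) in newer pandas

writer = ExcelWriter('Combined_Data.xlsx')
combined.to_excel(writer, 'Sheet1', index=False)
writer.save()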

How to split a large excel file into multiple worksheets based on the given ip addresses using pandas python

I'm new to pandas and python, so I ran into some trouble. I have one large excel file which I need to divide into multiple worksheets using a python script, based on the ip addresses given in the data. I can't figure out how to do that and would appreciate some help and guidance.
I have no knowledge of working with python or any libraries before. This is what I did, but it created a workbook for each row.
import pandas as pd

df = pd.read_excel("D:/Users/Zakir/Desktop/MyNotebooks/Legacy.xls", sheet_name="Total", header=0, index_col=None)
writer = pd.ExcelWriter('D:/Users/Zakir/Desktop/MyNotebooks/pandas_simple.xlsx', engine='xlsxwriter')
for index, row in df.iterrows():
    df1 = df.iloc[[index]]
    df1.set_index('Number', inplace=True)
    df1.to_excel(writer, sheet_name=row['IPAddress'])
writer.save()
This is the kind of excel file I have: over 5000 rows. There are 60 groups of ip addresses, and each group has to go into its own worksheet.
One solution, if you have enough memory:

import pandas as pd
from pandas import ExcelWriter

df = pd.read_excel('file', sheet_name="Total", header=0)  # other settings.....
writer = ExcelWriter('E:/output.xlsx', engine='xlsxwriter')
print(df)

def writesheet(g):
    a = g['IPAddress'].tolist()[0]
    g.to_excel(writer, sheet_name=str(a), index=False)  # index=True if you want to keep the index

df.groupby('IPAddress').apply(writesheet)
writer.save()
This is how I implemented the code to check a folder, loop through all the excel files, and split each file by the values of a column whose name can be passed as input (vColName), assuming one sheet per file:
import sys
import os, shutil
from os import listdir
import pandas as pd
import xlrd

# this method retrieves all the xlsx filenames from a folder
def find_excel_filenames(path_to_dir, suffix=".xlsx"):
    filenames = listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

# this folder contains .xlsx files
filePath = "D:\\files\\sample\\"
# there is a subfolder in my solution to move the processed files to
# and another subfolder to move the split output files to
archivePath = os.path.join(filePath, "archive")
outPath = os.path.join(filePath, "output")

# the column name to split on, passed as input
vColName = sys.argv[1]

# get a list of filenames
fnames = find_excel_filenames(filePath)

# loop through each file
for fl in fnames:
    vFile = os.path.join(filePath, fl)
    # load the content of the file to a data frame;
    # I open the file twice, first to get the number of columns and
    # create the converter, then to open the file with the string converter,
    # which helps with trimming of leading zeros
    df = pd.read_excel(vFile, header=None)
    converter = {col: str for col in df.columns}
    df1 = pd.read_excel(vFile, converters=converter)

    colValues = df1[vColName].unique().tolist()
    for v in colValues:
        filteredDF = df1.loc[df1[vColName] == v]
        vOutFile = os.path.join(outPath, fl + '_' + v.replace("/", " ") + '.xlsx')
        writer = pd.ExcelWriter(vOutFile, engine='xlsxwriter')
        # Convert the dataframe to an XlsxWriter Excel object.
        filteredDF.to_excel(writer, sheet_name='Sheet1')
        # Close the Pandas Excel writer and output the Excel file.
        writer.save()

    # move the processed file to an archive folder
    dst_file = os.path.join(archivePath, fl)
    if os.path.exists(dst_file):
        os.remove(dst_file)
    shutil.move(vFile, archivePath)
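Assuming the script above is saved as, say, split_by_column.py (a hypothetical name), the column to split on is then supplied on the command line, e.g. python split_by_column.py IPAddress.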

Write to multiple csv using glob

I am working with lots of csv files and need to add a column. I tried glob, for example:
import glob
import numpy as np
import pandas as pd

filenames = sorted(glob.glob('./DATA1/*2018*.csv'))
filenames = filenames[0:10]
for f in filenames:
    df = pd.read_csv(f, header=None, index_col=None)
    df.columns = ['Date', 'Signal', 'Data', 'Code']
    # this is what I should add to all csv files
    df["ID"] = df["Data"].str.slice(0, 2)
and I need a way to save each file back to csv (not concatenated) under a different name, such as "file01edited.csv", after I add the column.
Use to_csv with f-strings to change the file names:
for f in filenames:
    df = pd.read_csv(f, names=['Date', 'Signal', 'Data', 'Code'], index_col=None)
    # this is what I should add to all csv files
    df["ID"] = df["Data"].str.slice(0, 2)
    # python 3.6+
    df.to_csv(f'{f[:-4]}edited.csv', index=False)
    # python below 3.6
    # df.to_csv('{}edited.csv'.format(f[:-4]), index=False)

Concatenate csv files in python by ascending order of filenames

I need to concatenate csv files with the same column headers in python. The csv files with the following filenames should be concatenated in order, as shown below (ascending order of filename):
AB201602.csv
AB201603.csv
AB201604.csv
AB201605.csv
AB201606.csv
AB201607.csv
AB201608.csv
AB201610.csv
AB201612.csv
I would like to keep the column headers only from the first file. Any idea?
I tried the code below, but it combined the csv files in random filename order and truncated half of the column header names. Thanks.
csvfiles = glob.glob('/home/c/*.csv')
wf = csv.writer(open('/home/c/output.csv', 'wb'), delimiter=',')
for files in csvfiles:
    rd = csv.reader(open(files, 'r'), delimiter=',')
    rd.next()  # skip the header row
    for row in rd:
        print(row)
        wf.writerow(row)
Using @Gokul's comment and pandas:
import pandas as pd
import glob

csvfiles = sorted(glob.glob('/home/c/*.csv'))
df = pd.DataFrame()
for files in csvfiles:
    df = df.append(pd.read_csv(files))
df.to_csv('newfile.csv')
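Note that df.append worked when this was written but has since been deprecated and removed in pandas 2.0; an equivalent sketch using pd.concat, with the same sorted() call to guarantee ascending filename order and a single header row in the output:

import pandas as pd
import glob

csvfiles = sorted(glob.glob('/home/c/*.csv'))
# sorted() guarantees ascending filename order; concat keeps one set of headers
df = pd.concat((pd.read_csv(f) for f in csvfiles), ignore_index=True)
df.to_csv('newfile.csv', index=False)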

Reading multiple csv files and writing them to another csv file

I have this code
import pandas as p
import csv
df = p.read_csv('interview1.csv')
df2 = df[['Participant', 'Translation']] # selects two of the columns in your file
df2.to_csv('out.csv')
How do I read multiple files and then write to 'out.csv'? So basically, instead of reading only interview1, I read interview2, interview3 up to interview7 as well into out.csv.
Simply open the output file in append mode:
import pandas as p
import csv

csv_list = ['interview1.csv', 'interview2.csv', ...]
for itw in csv_list:
    df = p.read_csv(itw)
    df.to_csv('out.csv', mode='a')
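Note that with mode='a' each dataframe writes its own header row and index into out.csv. A small variant (a sketch, keeping the column selection from the question) that writes the header only for the first file:

import pandas as p

csv_list = ['interview1.csv', 'interview2.csv', ...]
for i, itw in enumerate(csv_list):
    df = p.read_csv(itw)
    df2 = df[['Participant', 'Translation']]  # selects two of the columns in your file
    # write the header only once, for the first file
    df2.to_csv('out.csv', mode='a', header=(i == 0), index=False)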
Use this to read all .CSV data from a folder and combine it together:
import pandas as pd
import glob
import os
path = r'file path'
all_files = glob.glob(os.path.join(path, "*.csv"))
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df.to_csv("combined-data_new.csv")
