I would like to copy the data from multiple Excel files and paste it into an existing Excel file.
import os
import openpyxl
import pandas as pd

my_path = r'C:/Users/greencolor/Autoreport/Load_attachments/'  # updated every Monday with new files
my_path1 = r'C:/Users/greencolor/Desktop/Autoreport/'  # location of Master.xlsx

for filename in os.listdir(my_path):  # loop through the downloaded files
    if filename.startswith('PB orders Dec'):  # finds the file named PB orders December.xlsb
        dec = pd.read_excel(os.path.join(my_path, filename),
                            sheet_name='Raw data ',
                            engine='pyxlsb')  # reads the specific sheet named 'Raw data '
        with pd.ExcelWriter(os.path.join(my_path1, 'Master.xlsx'),
                            mode='a', engine='openpyxl') as writer:
            # copies the data from PB orders December.xlsb into the Master file's
            # DecData sheet without deleting the other sheets,
            # i.e. updates the old data with the new data
            dec.to_excel(writer, sheet_name='DecData', index=False)
My problem is that I would like to perform the same operation on multiple files. For example, if Load_attachments contains a file named PB orders November.xlsb, I want to apply the same code to that November file, and so on for whatever month name a file has.
You can put the entire thing (minus imports) in a for loop and feed it with an array of filename beginnings:
import os
import openpyxl
import pandas as pd

fnamebegin_array = ['PB orders Jan', 'PB orders Feb', 'PB orders Mar', 'etc']
sheetname_array = ['JanData', 'FebData', 'MarData', 'etc']

my_path = r'C:/Users/greencolor/Autoreport/Load_attachments/'
my_path1 = r'C:/Users/greencolor/Desktop/Autoreport/'

for i in range(len(fnamebegin_array)):  # 12 iterations if there is data for each month of the year
    for filename in os.listdir(my_path):
        if filename.startswith(fnamebegin_array[i]):
            month = pd.read_excel(os.path.join(my_path, filename),
                                  sheet_name='Raw data ',
                                  engine='pyxlsb')
            with pd.ExcelWriter(os.path.join(my_path1, 'Master.xlsx'),
                                mode='a', engine='openpyxl') as writer:
                month.to_excel(writer, sheet_name=sheetname_array[i], index=False)
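Since each sheet name can be derived from the month in the filename, the two arrays can also be collapsed into a single list of month abbreviations. A minimal sketch along those lines (untested; if_sheet_exists='replace' needs pandas 1.3+ and makes weekly re-runs overwrite the month sheet instead of failing):

import os
import pandas as pd

my_path = r'C:/Users/greencolor/Autoreport/Load_attachments/'
my_path1 = r'C:/Users/greencolor/Desktop/Autoreport/'

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

for mon in months:
    for filename in os.listdir(my_path):
        if filename.startswith('PB orders ' + mon):
            data = pd.read_excel(os.path.join(my_path, filename),
                                 sheet_name='Raw data ',
                                 engine='pyxlsb')
            with pd.ExcelWriter(os.path.join(my_path1, 'Master.xlsx'),
                                mode='a', engine='openpyxl',
                                if_sheet_exists='replace') as writer:  # pandas 1.3+
                data.to_excel(writer, sheet_name=mon + 'Data', index=False)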
I need help with my Python code.
The goal is:
read in between 100 and 200 CSV files that are in a folder
copy a variable in each CSV file from position (2,2)
create the sum of all values of column 17 in every CSV
transfer the values into a dataframe
create a new Excel file
transfer the dataframe into the Excel file
My attempt was the following code:
# import necessary libraries
import pandas as pd
import os
import glob

# use glob to get all the csv files in the folder
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "*.csv"))

# loop over the list of csv files
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f, sep=';', skiprows=2, usecols=[2, 16], header=None)
    # ID
    ID = df.loc[2][2]
    # sum of col. 16
    dat_Verbr = df[16].sum()
    # data in a single dataframe
    df4 = pd.DataFrame({'SIM-Karte': ID, 'Datenverbrauch': dat_Verbr},
                       index=[0, 1, 2, 3, 4, 5])

# specify the name of the excel file
file_name = 'Auswertung.xlsx'
# saving the excel sheet
concatenated.to_excel(file_name)
print('record successfully exported into Excel File')
Unfortunately, it doesn't work.
The problem is that only the first ID and the first sum end up in the Excel file.
How can I handle the index when building a single dataframe? I don't know the exact number of csv files, only that it is somewhere between 100 and 200.
I'm a beginner with Python.
Can someone help me, please?
You can use the updated code below. One assumption I made is that every row has data in columns 1 through 16; if a file has just ;;;;... in its first row, read_csv sometimes makes a mistake. Also, because skiprows=1 is used, a value in row 1 of column 17 (if present) will not be added to the sum; you would need to change the code if that should be included. The rest I have corrected/changed so the code works. Note that in to_excel I used index=False, as I didn't think you need the index to be added; remove it if you want to see the index as well.
# use glob to get all the csv files in the folder
import os, glob
import pandas as pd

path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "*.csv"))

# data in a single dataframe
df4 = pd.DataFrame(columns=['SIM-Karte', 'Datenverbrauch'])

# loop over the list of csv files
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f, sep=';', skiprows=1, usecols=[1, 16], header=None)
    # ID
    ID = df[1].iloc[0]
    # sum of col. 16
    dat_Verbr = df[16].sum()
    df4.loc[len(df4.index)] = [ID, dat_Verbr]

# specify the name of the excel file
file_name = 'Auswertung.xlsx'
# saving the excel sheet
df4.to_excel(file_name, index=False)
print('record successfully exported into Excel File')
Output excel - I had 3 files in the folder
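As a side note, growing a DataFrame row by row with df4.loc[len(df4.index)] re-allocates the frame on every iteration; with 100-200 files it is usually cleaner (and faster) to collect plain dicts and build the frame once. A sketch assuming the same file layout as above:

import glob
import os
import pandas as pd

rows = []  # one dict per csv file
for f in glob.glob(os.path.join(os.getcwd(), "*.csv")):
    df = pd.read_csv(f, sep=';', skiprows=1, usecols=[1, 16], header=None)
    rows.append({'SIM-Karte': df[1].iloc[0],       # the ID cell
                 'Datenverbrauch': df[16].sum()})  # sum of the data column

df4 = pd.DataFrame(rows, columns=['SIM-Karte', 'Datenverbrauch'])
df4.to_excel('Auswertung.xlsx', index=False)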
I am trying to pull a specific sheet, and specific columns within that sheet, from a list of spreadsheets. Think of a monthly report that is structured the same from month to month except for a date stamp in the file name, i.e. Metrics 202001.xlsx and so on.
I am using openpyxl, which after a lot of trial and error is working great. My issue is that I want to write those specific columns to a dataframe or an .xlsx file for a summary.
So I am looping through the workbooks and grabbing the sheet I want from each (thankfully all named the same). Where I am getting tripped up is pulling the specific columns and writing them. Here is my code thus far:
import os
import pandas as pd
import openpyxl

path = os.getcwd()
files = os.listdir(path)
print(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']
print(files_xlsx)

Sp = pd.DataFrame()  # make a blank dataframe to fill in
# fields I want to pull from the worksheet within each workbook
headers = ["Fiscal Month", "Country", "Beginning Balance", "Acquisitions",
           "Reinstatements", "Terminations", "Delinq"]

for f in files_xlsx:
    wb = openpyxl.load_workbook(filename=f)
    ws = wb['Metrics']
    for col_cells in ws.iter_cols(min_col=2, max_col=2, max_row=ws.max_row + 1):
        for cell in col_cells:
            pass  # this is where I get stuck
I would like to dynamically fill the min_col and max_col values rather than hard-coding them. From there I would either write to the dataframe I created or to a new Excel file. Any help would be greatly appreciated, as I can see this code having more applications than just the project I am working on. Thanks!
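To illustrate the "dynamic" part: a rough, untested sketch that matches the headers list to column positions in row 1 instead of hard-coding min_col/max_col (col_index is just an illustrative name, and it assumes the headers sit in row 1):

import os
import openpyxl
import pandas as pd

headers = ["Fiscal Month", "Country", "Beginning Balance", "Acquisitions",
           "Reinstatements", "Terminations", "Delinq"]

Sp = pd.DataFrame()
for f in [f for f in os.listdir(os.getcwd()) if f.endswith('.xlsx')]:
    ws = openpyxl.load_workbook(filename=f)['Metrics']
    # map each wanted header in row 1 to its 1-based column index
    col_index = {cell.value: idx
                 for idx, cell in enumerate(ws[1], start=1)
                 if cell.value in headers}
    # collect just those columns from the data rows
    rows = [{name: row[idx - 1] for name, idx in col_index.items()}
            for row in ws.iter_rows(min_row=2, values_only=True)]
    Sp = pd.concat([Sp, pd.DataFrame(rows, columns=headers)], ignore_index=True)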
Seems like I figured this out. Thanks to the poster who answered a very similar question to this one!!
import os
import pandas as pd

path = os.getcwd()
files = os.listdir(path)
print(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']

df = pd.DataFrame()
for f in files_xlsx:
    data = pd.read_excel(f, "Sponsorship Metrics")
    df = pd.concat([df, data], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
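If the goal is still only the specific columns from the headers list in the question, read_excel can also do the column filtering for you via usecols with column names (a sketch under that assumption, untested):

import os
import pandas as pd

headers = ["Fiscal Month", "Country", "Beginning Balance", "Acquisitions",
           "Reinstatements", "Terminations", "Delinq"]
files_xlsx = [f for f in os.listdir(os.getcwd()) if f.endswith('.xlsx')]

# usecols accepts a list of column names and keeps only those columns
frames = [pd.read_excel(f, "Sponsorship Metrics", usecols=headers)
          for f in files_xlsx]
df = pd.concat(frames, ignore_index=True)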
I need to extract an Excel worksheet from multiple workbooks, save it to a dataframe, and in turn save that dataframe.
I have a spreadsheet that is generated at the end of each month (e.g. June 2019.xlsx, May 2019.xlsx, April 2019.xlsx).
I need to grab the worksheet 'Sheet1' from each of these workbooks and convert it to a dataframe (df1).
I would like to have this dataframe saved.
As a nice to have, I would also like some way just to append the next month's data after the initial 'data grab'.
I'm relatively new to this, so I haven't made much progress.
import os
import glob
import pandas as pd
import xlrd
import json
import io
import flatten_json

files = glob.glob('/Users/ngove/Documents/Python Scripts/2019/*.xlsx')
dfs = {}
for f in files:
    dfs[os.path.splitext(os.path.basename(f))[0]] = pd.read_excel(f, sheet_name='Sheet1')
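That gives me one dataframe per workbook, keyed by filename; I assume the pieces could then be combined and saved along these lines (untested; the 'source' column is just to keep track of origin):

import pandas as pd

frames = []
for name, frame in dfs.items():
    frame = frame.copy()
    frame['source'] = name  # remember which workbook each row came from
    frames.append(frame)

combined = pd.concat(frames, ignore_index=True)
combined.to_excel('combined.xlsx', index=False)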
You can drop all of your files in a directory (e.g. current directory). Then append all of your excel files in a list (e.g. files_xls). Iterate over all your files and use pandas.read_excel to get the respective dataframes (e.g. list_frames).
Below, you can find an example:
import os
import pandas as pd

path = os.getcwd()  # get current dir
files = os.listdir(path)  # get all the files in your current dir

# get only the xls or xlsm files (this depends on you)
files_xls = [f for f in files if (f[-3:] == 'xls' or f[-4:] == 'xlsm')]

list_frames = []
for f in files_xls:
    print("Processing file: %s" % f)
    try:
        # the following gives you the dataframe;
        # the read_excel params depend on your data format
        data = pd.read_excel(f, 'Sheet1', header=0, index_col=None)
        list_frames.append(data)
    except Exception as e:
        print("Skipping %s: %s" % (f, e))

# at the end you can concat your data if you want and remove any duplicates
df = pd.concat(list_frames, sort=False).fillna(0)
df = df.drop_duplicates()

# at the end you can save it
with pd.ExcelWriter("your_title.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name="Sheet1", index=False)
I hope this helps.
I interpreted your statement that you want to save the dataframe as meaning you want to save it as a combined Excel file. This will combine all files in the specified folder that end in xlsx.
import os
import pandas as pd
from pandas import ExcelWriter

os.chdir("H:/Python/Reports/")  # edit this to be your path
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']

df = pd.DataFrame()
for f in files_xlsx:
    data = pd.read_excel(f, 'Sheet1')
    df = pd.concat([df, data], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

with ExcelWriter('Combined_Data.xlsx') as writer:
    df.to_excel(writer, 'Sheet1', index=False)
You could update the code to grab all 2019 files by changing the one line to this:
files_xlsx = [f for f in files if f[-9:] == '2019.xlsx']
I referenced this question for most of the code, updated it for xlsx, and added the file-save portion of the code.
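For the "nice to have" in the question (appending the next month's data after the initial grab), one simple option is to re-read the combined file and append only the new workbook (a sketch; 'July 2019.xlsx' is just an example filename):

import pandas as pd

# read what was combined so far, add the new month, and rewrite
existing = pd.read_excel('Combined_Data.xlsx', 'Sheet1')
new_month = pd.read_excel('July 2019.xlsx', 'Sheet1')  # example: the newest monthly file
combined = pd.concat([existing, new_month], ignore_index=True)

with pd.ExcelWriter('Combined_Data.xlsx') as writer:
    combined.to_excel(writer, 'Sheet1', index=False)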
Writing DataFrame to excel file leaves sheets with zero data.
I am creating a robotics "scouting application". It receives multiple .csv files over the course of two days. The csv files are named with a four-digit team number, a hyphen, and a match number, for example 2073-18.csv. Multiple files will arrive for each team. I need one sheet per team, with the content of every csv file for that team on the same sheet. Creating the sheets works; writing the data to them doesn't.
import os
import glob
import csv
from xlsxwriter.workbook import Workbook
import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
from openpyxl import load_workbook

os.chdir("/EagleScout")
path = '.'
extension = 'csv'
engine = 'xlsxwriter'
files_in_dir = [f for f in glob.glob('*.csv')]

workbook = Workbook('Tournament.xlsx')
with pd.ExcelWriter('Tournament.xlsx') as writer:
    for csvfile in files_in_dir:
        df = pd.read_csv(csvfile)
        fName, fExt = os.path.splitext(csvfile)
        sName = fName.split('-')
        worksheet = workbook.get_worksheet_by_name(sName[0])
        if worksheet is None:
            worksheet = workbook.add_worksheet(sName[0])  # worksheet named after the team number
        df.to_excel(writer, sheet_name=sName[0])
    writer.save()
workbook.close()
What I need is one workbook with one sheet per team, up to 70 teams. Each sheet will have multiple rows, one for each csv file that arrived for that team. The question is: how do I get pandas, or another library, to write the content of each csv file to its appropriate sheet in the workbook?
OK, with the input from @ivan_pozdeev, I finally got past my issues.
Remember, my original desire was to generate a script that could be run on a regular basis and produce a spreadsheet with multiple worksheets, each containing all the data from the .csv files for every match played, grouped by team number.
I have also added a single spreadsheet that contains the raw data.
Here is what I came up with:
import os
import glob
import csv
import xlsxwriter
from xlsxwriter.workbook import Workbook
import pandas as pd
import numpy as np
#from sqlalchemy import create_engine
#import openpyxl
#from openpyxl import load_workbook

os.chdir("/EagleScout")
path = '.'
extension = 'csv'

# Remove the combined .csv file from previous runs.
# This provides clean data, without corruption from earlier runs.
if os.path.exists('./Spreadsheets/combined.csv'):
    os.remove('./Spreadsheets/combined.csv')

# Remove the previous Excel spreadsheets
if os.path.exists('./Spreadsheets/Tournament.xlsx'):
    os.remove('./Spreadsheets/Tournament.xlsx')
if os.path.exists('./Spreadsheets/Combined.xlsx'):
    os.remove('./Spreadsheets/Combined.xlsx')

# Read in and merge all .csv file names
files_in_dir = [f for f in glob.glob('*.csv')]

# Create a single combined .csv file with all data
# from all matches completed so far.
d1 = pd.read_csv('Header.txt')
d1.to_csv('./Spreadsheets/combined.csv', header=True, index=False)
for filenames in files_in_dir:
    df = pd.read_csv(filenames)
    fName, fExt = os.path.splitext(filenames)
    sName = fName.split('-')
    N = sName[1]
    df.insert(0, N, N, True)
    df.to_csv('./Spreadsheets/combined.csv', index_label=sName[0], mode='a')

# Combine all csv files into one master raw-data Excel file
# and add column headers as labels
with pd.ExcelWriter('./Spreadsheets/Combined.xlsx') as writer:
    dt = pd.read_csv('./Spreadsheets/combined.csv')
    dt.to_excel(writer, sheet_name='All data')

# Parse through all .csv files and append content to the appropriate team worksheet.
# The with-blocks save the files on exit, so no explicit writer.save() is needed.
with pd.ExcelWriter('./Spreadsheets/Tournament.xlsx') as writer:
    df2 = pd.read_excel('./Spreadsheets/Combined.xlsx')
    group = df2.groupby('Team')
    for Team, Team_df in group:
        Team_df.to_excel(writer, sheet_name=str(Team))
I am certain there is a cleaner way to write this code; I'm still new at this, but for now it does what I expect.
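For reference, the "cleaner way" could look something like this: build one frame straight from the csv files and let groupby drive the sheets (a sketch assuming, as above, that the team number is the filename part before the hyphen and that the csv files share the same columns):

import glob
import os
import pandas as pd

frames = []
for csvfile in glob.glob('*.csv'):
    team = os.path.splitext(csvfile)[0].split('-')[0]  # e.g. '2073' from '2073-18.csv'
    frame = pd.read_csv(csvfile)
    frame.insert(0, 'Team', team)  # tag every row with its team number
    frames.append(frame)

all_data = pd.concat(frames, ignore_index=True)

with pd.ExcelWriter('./Spreadsheets/Tournament.xlsx') as writer:
    all_data.to_excel(writer, sheet_name='All data', index=False)
    for team, team_df in all_data.groupby('Team'):
        team_df.to_excel(writer, sheet_name=str(team), index=False)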
I'm new to pandas and Python, so I ran into some trouble. I have one large Excel file which I need to divide into multiple worksheets using a Python script, splitting on the IP addresses given in the data. I can't figure out how to do that and would appreciate some help and guidance.
I had no knowledge of working with Python or any of its libraries before. This is what I did, but it created a worksheet for each row:
import pandas as pd

df = pd.read_excel("D:/Users/Zakir/Desktop/MyNotebooks/Legacy.xls",
                   sheet_name="Total", header=0)  # all other read_excel params left at their defaults
writer = pd.ExcelWriter('D:/Users/Zakir/Desktop/MyNotebooks/pandas_simple.xlsx', engine='xlsxwriter')
for index, row in df.iterrows():
    df1 = df.iloc[[index]]
    df1.set_index('Number', inplace=True)
    df1.to_excel(writer, sheet_name=row['IPAddress'])
writer.save()
This is the kind of Excel file I have: over 5000 rows. There are 60 groups of IP addresses, and each group has to be split into its own worksheet.
One solution, if you have enough memory:
import pandas as pd

df = pd.read_excel('file', sheet_name="Total", header=0)  # other settings as needed

def writesheet(g):
    a = g['IPAddress'].iloc[0]  # every row in the group shares this IP
    g.to_excel(writer, sheet_name=str(a), index=False)  # index=True if you want to keep the index

with pd.ExcelWriter('E:/output.xlsx', engine='xlsxwriter') as writer:
    df.groupby('IPAddress').apply(writesheet)
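apply calls writesheet once per IP group; the same thing spelled out as a plain loop may read more clearly (an equivalent sketch; note that Excel caps sheet names at 31 characters):

with pd.ExcelWriter('E:/output.xlsx', engine='xlsxwriter') as writer:
    for ip, group in df.groupby('IPAddress'):
        group.to_excel(writer, sheet_name=str(ip), index=False)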
This is how I implemented code that checks a folder, loops through all the Excel files in it, and splits each file by the values of a column whose name is passed as input (vColName), assuming one sheet per file:
import os, shutil
from os import listdir

import pandas as pd

# this method retrieves all the xlsx filenames from a folder
def find_excel_filenames(path_to_dir, suffix=".xlsx"):
    filenames = listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

# the column to split each file on, passed in as input
vColName = "IPAddress"  # example value

# this folder contains the .xlsx files
filePath = "D:/files/sample/"
# there is a subfolder in my solution to move the processed files to,
# and another subfolder for the split output files
archivePath = os.path.join(filePath, "archive")
outPath = os.path.join(filePath, "output")

# get a list of filenames
fnames = find_excel_filenames(filePath)

# loop through each file
for fl in fnames:
    vFile = os.path.join(filePath, fl)
    # load the content of the file into a data frame;
    # I open the file twice: first to get the columns and build the
    # converter, then to open the file with the string converter,
    # which helps with the trimming of leading zeros
    df = pd.read_excel(vFile, header=None)
    converter = {col: str for col in df.columns}
    df1 = pd.read_excel(vFile, converters=converter)

    colValues = df1[vColName].unique().tolist()
    for v in colValues:
        filteredDF = df1.loc[df1[vColName] == v]
        vOutFile = os.path.join(outPath,
                                os.path.splitext(fl)[0] + '_' + v.replace("/", " ") + '.xlsx')
        writer = pd.ExcelWriter(vOutFile, engine='xlsxwriter')
        # convert the dataframe to an XlsxWriter Excel object
        filteredDF.to_excel(writer, sheet_name='Sheet1')
        # close the Pandas Excel writer and output the Excel file
        writer.close()

    # move the processed file to the archive folder
    dst_file = os.path.join(archivePath, fl)
    if os.path.exists(dst_file):
        os.remove(dst_file)
    shutil.move(vFile, archivePath)
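With vColName = 'IPAddress', for example, a Legacy.xlsx containing three distinct addresses should yield three files in the output subfolder (e.g. Legacy_10.0.0.1.xlsx; filenames here are illustrative), after which the processed source file is moved to the archive folder.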