Write Dataframe row to excel sheet using Pandas - python

How do I save returned row from dataframe into excel sheet?
Story: Am working with large txt file (1.7M rows), containing postal codes for Canada. I created a dataframe, and extracted values I need into it. One column of the dataframe is the province id (df['PID']). I created a list of the unique values found in that PID column, and am successfully creating the (13) sheets, each named after the unique PID, in a new excel spread sheet.
Problem: Each sheet only contains the headers, and not the values of the row.
I am having trouble writing the matching row to the sheet. Here is my code:
import pandas as pd
# parse text file into dataframe
path = 'the_file.txt'
df = pd.read_csv(path, sep='\t', header=None, names=['ORIG', 'PID','PCODE'], encoding='iso-8859-1')
# extract characters to fill values
df['ORIG'] = df['ORIG']
df['PID'] = df['ORIG'].str[11:13].astype(int)
df['PCODE'] = df['ORIG'].str[:6]
# create list of unique province ID's
prov_ids = df['PID'].unique().tolist()
prov_ids_string = map(str, prov_ids)
# create new excel file
writer = pd.ExcelWriter('CanData.xlsx', engine='xlsxwriter')
for id in prov_ids_string:
mydf = df.loc[df.PID==id]
# NEED TO WRITE VALUES FROM ROW INTO SHEET HERE*
mydf.to_excel(writer, sheet_name=id)
writer.save()
I know where the writing should happen, but I haven't gotten the correct result. How can I write only the rows which have matching PID's to their respective sheets?
Thank you

The following should work:
import pandas as pd
import xlsxwriter
# parse text file into dataframe
# extract characters to fill values
df['ORIG'] = df['ORIG']
df['PID'] = df['ORIG'].str[11:13].astype(int)
df['PCODE'] = df['ORIG'].str[:6]
# create list of unique province ID's
prov_ids = df['PID'].unique().tolist()
#prov_ids_string = map(str, prov_ids)
# create new excel file
writer = pd.ExcelWriter('./CanData.xlsx', engine='xlsxwriter')
for idx in prov_ids:
mydf = df.loc[df.PID==idx]
# NEED TO WRITE VALUES FROM ROW INTO SHEET HERE*
mydf.to_excel(writer, sheet_name=str(idx))
writer.save()
For example data:
df = pd.DataFrame()
df['ORIG'] = ['aaaaaa111111111111111111111',
'bbbbbb2222222222222222222222']
df['ORIG'] = df['ORIG']
df['PID'] = df['ORIG'].str[11:13].astype(int)
df['PCODE'] = df['ORIG'].str[:6]
print(df)
In my Sheet 11, I have:
Kr.

Related

How to copy tables from a pdf file to excel file, except the headers using python

I have extracted Tables from a pdf file to an excel(xlsx) file using python. Now I want All the data except the headers to appear in the excel file. What changes should I make to the code. I am attaching the code below for you.
The code:-
import camelot
import PyPDF2
import pandas as pd
# PDF file to extract tables from
file = "C:/Users/mahma_dv2pq9y/Downloads/santander.pdf"
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('C:/Users/mahma_dv2pq9y/OneDrive/santander_agg_mortgage.xlsx', engine='xlsxwriter')
# extract all the tables in the PDF file
tables = camelot.read_pdf(file, pages='all',flavor="stream" , encoding="utf-8")
except('Maximum loan to value, Initial rate, Differential to BBR, Product fee, Completion deadline, Minimum loan size, Maximum loan size, Early repayment
charge plus Benefit package, Payable if you repay on or before, product code')
#number of tables extracted
print("Total tables extracted:", tables.n)
# print the first table as Pandas DataFrame
#print(tables[1].df)
# export individually as Excel (.xlsx extension)
#tables[1].to_excel("/mnt/projetcs/pdf_excel/agg.xlsx")
columns = ['Additional Info']
for i in range (1, tables.n):
#print(tables[i].df)
#tables[i].df.to_excel(writer, sheet_name='Sheet'+str(i), index=False)
temp_df = tables[i].df
#temp_df.rename(columns=temp_df.iloc[0]).drop(temp_df.index[0])
#temp_df.drop(columns, inplace=True, axis=1,errors='ignore')
# iterating the columns
for col in temp_df.columns:
#print(temp_df.iloc[:, [col]])
if(col>7):
print(col)
#print(temp_df.drop(temp_df.iloc[:, [col]], axis=1,errors='ignore'))
#print(temp_df)
col_length=len(temp_df. columns);
print("count"+str(col_length))
if(col_length > 6):
print("save")
temp_df.to_excel(writer, sheet_name='Sheet', index=False)
writer.save()
I have no Santander Bank Statement to test, but I am almost sure you will manage to adjust it to your needs:
import camelot
import pandas as pd
# PDF file to extract tables from
file = r"C:/Users/mahma_dv2pq9y/Downloads/santander.pdf"
# extract all the tables in the PDF file
tables = camelot.read_pdf(file, pages='all', flavor="stream", encoding="utf-8")
master_DF = pd.DataFrame() # Creates empty dataframe to update it with data later
for i in range(tables.n):
if i == 0: # if table is from the firs Statement page to below:
new_header = tables[i].df.iloc[4] # choose row with headers from tables[0].df (header's names to list)
tables[i].df = tables[i].df[5:] # read only data after the header row (top rows usually have lots bank account related info which is not needed)
tables[i].df.columns = new_header # rename empty headers using list
master_DF = pd.concat([master_DF, tables[i].df], axis=0, ignore_index=True) # append empty DataFrame with data
else:
tables[i].df = tables[i].df[1:] # reading DataFrame without top row (usually bank Statement has Info/Logo row)
tables[i].df.columns = new_header # rename empty headers using list
master_DF = pd.concat([master_DF, tables[i].df], axis=0, ignore_index=True) # appending data
print(tables[i].df)
print("")
float_cols = ['Debit', 'Credit', 'Balance'] # list of columns with numeric data (float)
for col in float_cols:
master_DF[col]=master_DF[col].str.replace(",","") # striping out commas before datatype conversion
master_DF[float_cols] = master_DF[float_cols].apply(pd.to_numeric) # datatype conversion: strings converted to numeric
master_DF.to_excel(r"C:/Users/mahma_dv2pq9y/OneDrive/NEW_santander_agg_mortgage.xlsx", index=False, sheet_name="BS_Statement")
print("DATA Saved")

python add multiple columns to excel file one by one

I have an excel file with multiple sheet, I want to group the columns in the logs list and store them in another excel file, but some sheets do not contain some columns, so tf a column does not exist in a sheet don't store it, the code work well but it store only the last column
import pandas as pd
sheets_names = ['R9_14062021','R9_02122020','R9_14062021','R9_28052021','R9_17052021','R9_03052021','R9_14042021','R9_24032020','R9_19032020','R9 30112020','R9_17112020','R7_27012021','LOGS R9 01032021','LOGS R7 SAT01032021','R7_30032020','G9_06032020','G5T_20012021','TNT_08122020','R7_SAT_24112020','G6T_12112020','R9 12102020']
logs = [' Msd','Provider Id','Terminal Type','chgtCh','accessRecordModule','playerPlay startOver','playerPlay PdL','playerPlay PVR','contentHasAds','pdlComplete','lirePdl','lireVod']
dfs_list = pd.read_excel('COMPIL LOGS INDICATEURS V14062021.xlsx',sheet_name = sheets_names )
writer = pd.ExcelWriter('pandas_multiple.xlsx', engine='xlsxwriter')
for sheet in dfs_list:
df = dfs_list[sheet]
df['Dt'] = pd.to_datetime(df['Dt']).dt.date
df1 = df.groupby(['Dt','webApp','mw'])[' Msd'].count()
for log in logs:
if log in df:
df1 = df.groupby(['Dt','webApp','mw'])[log].sum()
df1.to_update.get(sheet)
#df1.reset_index(inplace=True)
df1.to_excel(writer, sheet_name=sheet)
writer.save()
result:

How to obtain the mean of selected columns from multiple sheets within same Excel File

I am working with a large excel file having 22 sheets, where each sheet has the same coulmn headings but do not have equal number of rows. I would like to obtain the mean values (excluding zeros) for columns AA to AX for all the 22 sheets. The columns have titles which I use in my code.
Rather than reading each sheet, I want to loop through the sheets and get as output the mean values.
With help from answers to other posts, I have this:
import pandas as pd
xls = pd.ExcelFile('myexcelfile.xlsx')
xls.sheet_names
#print(xls.sheet_names)
out_df = pd.DataFrame()
for sheets in xls.sheet_names:
df = pd.read_excel('myexcelfile.xlsx', sheet_names= None)
df1= df[df[:]!=0]
df2=df1.loc[:,'aa':'ax'].mean()
out_df.append(df2) ## This will append rows of one dataframe to another(just like your expected output)
print(out_df2)
## out_df will have data from all the sheets
The code works so far, but only one of the sheets. How do I get it to work for all 22 sheets?
You can use numpy to perform basic math on pandas.DataFrame or pandas.Series
take a look at my code below
import pandas as pd, numpy as np
XL_PATH = r'C:\Users\YourName\PythonProject\Book1.xlsx'
xlFile = pd.ExcelFile(XL_PATH)
xlSheetNames = xlFile.sheet_names
dfList = [] # variable to store all DataFrame
for shName in xlSheetNames:
df = pd.read_excel(XL_PATH, sheet_name=shName) # read sheet X as DataFrame
dfList.append(df) # put DataFrame into a list
for df in dfList:
print(df)
dfAverage = np.average(df) # use numpy to get DataFrame average
print(dfAverage)
#Try code below
import pandas as pd, numpy as np, os
XL_PATH = "YOUR EXCEL FULL PATH"
SH_NAMES = "WILL CONTAINS LIST OF EXCEL SHEET NAME"
DF_DICT = {} """WILL CONTAINS DICTIONARY OF DATAFRAME"""
def readExcel():
if not os.path.isfile(XL_PATH): return FileNotFoundError
SH_NAMES = pd.ExcelFile(XL_PATH).sheet_names
# pandas.read_excel() have argument 'sheet_name'
# when you put a list to 'sheet_name' argument
# pandas will return dictionary of dataframe with sheet_name as keys
DF_DICT = pd.read_excel(XL_PATH, sheet_name=SH_NAMES)
return SH_NAMES, DF_DICT
#Now you have DF_DICT that contains all DataFrame for each sheet in excel
#Next step is to append all rows data from Sheet1 to SheetX
#This will only works if you have same column for all DataFrame
def appendAllSheets():
dfAp = pd.DataFrame()
for dict in DF_DICT:
df = DF_DICT[dict]
dfAp = pd.DataFrame.append(self=dfAp, other=df)
return dfAp
#you can now call the function as below:
dfWithAllData = appendAllSheets()
#now you have one DataFrame with all rows combine from Sheet1 to SheetX
#you can fixed the data, for example to drop all rows which contain '0'
dfZero_Removed = dfWithAllData[[dfWithAllData['Column_Name'] != 0]]
dfNA_removed = dfWithAllData[not[pd.isna(dfWithAllData['Column_Name'])]]
#last step, to find average or other math operation
#just let numpy do the job
average_of_all_1 = np.average(dfZero_Removed)
average_of_all_2 = np.average(dfNA_Removed)
#show result
#this will show the average of all
#rows of data from Sheet1 to SheetX from your target Excel File
print(average_of_all_1, average_of_all_2)

Is there any way to copy a particular column from excel sheet (say sheet_1) to the other column which is in sheet_2? Using Python

Please find the code below:
import pandas as pd
import csv
# Reading the csv file
df_new = pd.read_csv('source.csv')
# saving xlsx file
GFG = pd.ExcelWriter('source.xlsx')
df_new.to_excel(GFG, index = False)
GFG.save()
# read excel
xl = pd.ExcelFile("source.xlsx")
df = xl.parse("Sheet1")
# get the column you want to copy
column = df["Marks"]
# paste it in the new excel file
with pd.ExcelWriter('Target_excel.xlsx', mode='A') as writer:
column.to_excel(writer, sheet_name= "new sheet name", index = False)
writer.close()
In this code, it is replacing the existing contents of the target excel file.
I want to update a column in sheet 2 without changing other columns.
Example:
Excel file 1--> column_name = 'Marks'
Marks = 10,20,30
Excel file 2--> there are two columns present in this file
Subject_name = Math, English, Science
Marks = 50, 20, 40
So I want to copy "Marks" column from Excel file 1 and paste it into "Marks" column of Excel file 2(Without changing the data of "Subject" column)
import pandas as pd
import openpyxl as pxl
def get_col_idx(worksheet, col_name):
return next((i for i, col in enumerate(worksheet.iter_cols(1, worksheet.max_column)) if col[0].value == col_name), -1)
### ----- 0. csv -> xlsx (no change from your code)
df_new = pd.read_csv("source.csv")
GFG = pd.ExcelWriter("source.xlsx")
df_new.to_excel(GFG, index=False)
GFG.save()
### ----- 1. getting data to copy
# open file and get sheet of interest
source_workbook = pxl.load_workbook("source.xlsx")
source_sheet = source_workbook["Sheet1"]
# get "Marks" column index
col_idx = get_col_idx(source_sheet, "Marks")
# get contents in each cell
col_contents = [row[col_idx].value for row in source_sheet.iter_rows(min_row=2)]
### ----- 2. copy contents to target excel file
target_workbook = pxl.load_workbook("Target_excel.xlsx")
target_sheet = target_workbook["new sheet name"]
col_idx = get_col_idx(target_sheet, "Marks")
for i, value in enumerate(col_contents):
cell = target_sheet.cell(row=i+2, column=col_idx+1)
cell.value = value
target_workbook.save("Target_excel.xlsx")

dropping columns in multiple excel spreedsheets

Is there a way in python i can drop columns in multiple excel files? i.e. i have a folder with several xlsx files. each file has about 5 columns (date, value, latitude, longitude, region). I want to drop all columns except date and value in each excel file.
Let's say you have a folder with multiple excel files:
from pathlib import Path
folder = Path('excel_files')
xlsx_only_files = list(folder.rglob('*.xlsx'))
def process_files(xls_file):
#stem is a method in pathlib
#that gets just the filename without the parent or the suffix
filename = xls_file.stem
#sheet = None ensure the data is read in as a dictionary
#this sets the sheetname as the key
#usecols allows you to read in only the relevant columns
df = pd.read_excel(xls_file, usecols = ['date','value'] ,sheet_name = None)
df_cleaned = [data.assign(sheetname=sheetname,
filename = filename)
for sheetname, data in df.items()
]
return df_cleaned
combo = [process_files(xlsx) for xlsx in xlsx_only_files]
final = pd.concat(combo, ignore_index = True)
Let me know how it goes
stem
I suggest you should define columns you want to keep as a list and then select as a new dataframe.
# after open excel file as
df = pd.read_excel(...)
keep_cols = ['date', 'value']
df = df[keep_cols] # keep only selected columns it will return df as dataframe
df.to_excel(...)

Categories

Resources