Pandas loop through sheets without sheet name - python

I have a couple of excel files with different sheet names
Was wondering if it's possible to iterate through all of the sheet names without hard-coding a "sheetname" variable.
Right now it is through input selection.. I have tried to search before posting but haven't found or figured out how to work this out, help is appreciated.
import pandas as pd
import logging

# The original referenced an undefined `logger`; create a module logger.
logger = logging.getLogger(__name__)

# Look for sheet names
file = file_name
df = pd.ExcelFile(file).sheet_names

# Filter sheets: skip delta ("Δ…") sheets and the "Log Data" sheet.
# sheets[0] is a dummy entry so the user-facing numbering starts at 1.
counter = 0
sheets = [""]
for sheet in df:
    if sheet[0] == "Δ" or sheet == "Log Data":
        pass
    else:
        counter += 1
        sheets.append(sheet)
        print(f"{sheets[counter]} - {counter}")

# Sheet selection
try:
    x = int(input("Select Sheet Number: "))
    if x not in range(1, counter + 1):
        # Raise ValueError instead of using `assert`: asserts disappear
        # under `python -O`, and the AssertionError the original raised was
        # never caught by the `except ValueError` below anyway.
        raise ValueError("Select a value from list")
except ValueError as err:
    logger.error(err)
    raise
else:
    df = pd.read_excel(f"{file}", f"{sheets[x]}")
    # Print only on success. The original printed in `finally`, which
    # raised NameError for `x` when int(input(...)) had already failed.
    print(f"{sheets[x]} Selected")

Related

Splitting Excel Data by Groupings into Separate Workbook Sheets

Background:I have a large 40MB XLSX file that contains client data which is Grouped over multiple levels, like so:
Expanded -
Not Expanded (sorry about the terrible dummy data!) -
Objective:I would like to split Client A, B C etc... and all their respective underlying data into separate sheets (named 'Client A' etc...) in a Workbook.
Question:Am I correct in assuming that there is no python library that would help with this (e.g., xlsxwriter) and that I will likely have to save into multiple pandas df before splitting and writing to the xlsx file?
Sample Data:Here is a link to some randomized sample data. In this file you will see only 1 client (the total row can be ignored) however imagine the normal file having 40 clients / groupings and sub levels.
Sample Code: this function takes the `.xlsx` and writes each grouping to an appropriately named tab (e.g., 'Client A') in a separate worksheet of a new `.xlsx`. The issue with this code is that because I am basically going through and copying each cell individually, I didn't think to consider more holistically how to ensure the Groupings/Levels would be preserved. I think this code needs a complete re-write, and I welcome feedback.
import openpyxl
from copy import copy
from openpyxl import load_workbook
# Column letters that make up one client data row.
columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']

def copy_cell(ws, row, ws_row, ws1):
    """Copy cell styling (not values) for one row.

    Styles are copied from source sheet `ws1`, row `ws_row`, onto the
    destination sheet `ws`, row `row`, across all letters in `columns`.
    Cell values are appended separately by the caller.
    """
    for letter in columns:
        source = ws1[letter + str(ws_row)]
        target = ws[letter + str(row)]
        if not source.has_style:
            continue
        target.font = copy(source.font)
        target.border = copy(source.border)
        target.fill = copy(source.fill)
        target.number_format = copy(source.number_format)
        target.protection = copy(source.protection)
        target.alignment = copy(source.alignment)
# --- Scan the source sheet for client blocks -------------------------------
wb1 = openpyxl.load_workbook('annonamized_test_data_to_be_split.xlsx')
ws1 = wb1.active

indexs = []   # row numbers where a client block starts, plus a final sentinel
clients = []  # client names, parallel to `indexs`
index = 1
while ws1['A' + str(index)]:
    # A non-indented cell in column A marks the start of a client block.
    if str(ws1['A' + str(index)].alignment.indent) == '0.0':
        indexs.append(index)
        clients.append(ws1['A' + str(index)].value)
    # The first empty cell in column A ends the data; record it as a
    # sentinel so the last client's block has an end row.
    if ws1['A' + str(index)].value is None:
        indexs.append(index)
        break
    index += 1
wb1.close()

# --- Write each client block to its own sheet ------------------------------
wb = openpyxl.Workbook()
ws = wb.active
start_index = 1
headers = ['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
           'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']
for y, index in enumerate(indexs):
    try:
        # Excel limits sheet names to 31 characters.
        client = clients[y][:31] if len(clients[y]) >= 32 else clients[y]
        wb.create_sheet(client)
        ws = wb[client]
        ws.column_dimensions['A'].width = 35
        ws.append(headers)
        row_index = 2
        # Copy values and styles for every row of this client's block.
        for i in range(start_index, indexs[y + 1]):
            ws.append([ws1[col + str(i)].value for col in columns])
            copy_cell(ws, row_index, i, ws1)
            row_index += 1
        start_index = indexs[y + 1]
    except (IndexError, TypeError):
        # The last entry in `indexs` is the end-of-data sentinel: it has no
        # entry in `clients` and no following block, so `clients[y]` /
        # `indexs[y + 1]` go out of range. (The original used a bare
        # `except: pass`, which also hid real errors.)
        pass

wb.save('split_data.xlsx')
wb.close()

# Drop the default sheets that Workbook() created and that remained empty.
try:
    wb1 = openpyxl.load_workbook('split_data.xlsx')
    a = wb1['Sheet']
    wb1.remove(a)
    a = wb1['Sheet1']
    wb1.remove(a)
    wb1.save('split_data.xlsx')
    wb1.close()
except KeyError:
    # One or both default sheets were not present; as in the original,
    # the file is then left unsaved by this cleanup step.
    pass
Please can someone point me in the right direction of a resource that might teach me how to achieve this?
from openpyxl import load_workbook
def get_client_rows(sheet):
    """Return the row numbers of client header rows.

    Skips the header row (row 1), then treats every row whose first cell
    has no alignment indent (indent == 0.0) as the start of a client block.
    """
    # NOTE(review): the original function contained a second, unreachable
    # `return` based on row_dimensions outline levels; that dead code has
    # been removed.
    return [row[0].row for row in sheet.iter_rows(2) if row[0].alignment.indent == 0.0]
def delete_client_block(sheet, start, end):
    """Delete rows from `start` up to and including `end`.

    Row-dimension entries are discarded first so no stale formatting
    metadata survives the row deletion.
    """
    row = start
    while row <= end:
        sheet.row_dimensions.pop(row, None)
        row += 1
    sheet.delete_rows(start, end - start + 1)
def split_workbook(input_file, output_file):
    """Split each main group of the active sheet into its own sheet.

    Not to lose any formatting we copy the current sheet and remove all
    rows which do not belong to the extracted group. The original data
    sheet is removed before saving.
    """
    # Load outside the try block: if load_workbook raises, there is no
    # workbook to close (the original hit a NameError in `finally` here).
    workbook = load_workbook(input_file)
    try:
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)
        for index, client_row in enumerate(client_rows):
            # create new sheet for given client, shorten client as it might be too long
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:32]
            # delete rows after current client if available
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )
            # delete rows before current client if available
            if index > 0:
                first_client_row = client_rows[0]
                delete_client_block(
                    client_sheet, first_client_row, client_row - first_client_row + 1
                )
                # move left over dimensions to top of the sheet
                # NOTE(review): this loop is placed inside `if index > 0:`
                # because `first_client_row` is only bound there — confirm
                # against the original (indentation was lost in the paste).
                for row_index in list(client_sheet.row_dimensions.keys()):
                    # skip header row dimension
                    if row_index > first_client_row - 1:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - client_row + first_client_row
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension
        # remove the original data sheet; only the per-client copies remain
        del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        workbook.close()
if __name__ == "__main__":
    # Earlier test files kept for reference:
    # input_file = "annonamized_test_data_to_be_split.xlsx"
    # output_file = "split_data.xlsx"
    input_file = 'partial_Q1_Client_Billing_Data.xlsx'
    output_file = "splitting_full_data.xlsx"
    split_workbook(input_file, output_file)

Python KeyError when Comparing two DataFrames

I have extracted data from a CSV file and want to use it update values in a spreadsheet. The dataframe has a column 'ticker' of values. I want to check the existing values in the spreadsheet, and if the CSV has a new value add the new value to the spreadsheet.
# NOTE(review): this is the broken version discussed in the question —
# iterating a DataFrame directly (`for csv_row in reduced:`) yields COLUMN
# LABELS, not row indices, so `reduced.ticker[csv_row]` looks up a label
# like '2021-02-01 shares' in the index and raises KeyError. `next()` on a
# string label is also invalid. The fixed version further down iterates
# `reduced.index` / `df_xls.index` instead.
if df_xls.empty:
df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[0]]]}), ignore_index=True)
wtw = 1
print(reduced.columns.values)
print(df_xls.columns.values)
# iterates column labels, not rows — the source of the KeyError
for csv_row in reduced:
for xls_row in df_xls:
if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:
wtw = 0
break
else:
# next() expects an iterator; xls_row is a str label here
next(xls_row)
if wtw == 1:
df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[csv_row]]]}), ignore_index=True)
next(csv_row)
I am getting a "KeyError: 'ticker'" in reference to the line "if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:" I don't understand the error given the column names are correct. The print outputs above display:
['ticker' '2021-02-01 shares' '2021-02-01 value']
['ticker']
Thanks in advance.
Edit --
I do not have the code available at a URL, but here is the entirety of the script:
# NOTE(review): full script as posted by the asker; the comparison loop
# near the bottom is the broken part (see comments there).
import numpy as np
import pandas as pd
filename = "2021-02-01-FULLREPORT.csv"
##load new information from CSV into dataframe##
df_csv = pd.read_csv(filename)
# prefix is the date portion of the filename, e.g. "2021-02-01"
prefix = filename[0:10]
ticker = df_csv.ticker
shares = df_csv.shares
value = df_csv["market value($)"]
reduced = pd.DataFrame({
"ticker": ticker,
prefix +" shares": shares,
prefix +" value": value
})
##end load new information from CSV into dataframe##
##load excel
from pandas import ExcelWriter
from pandas import ExcelFile
df_xls = pd.read_excel('file.xlsx')
##update ticker list with information saved in reduced##
if df_xls.empty:
df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[0]]]}), ignore_index=True)
wtw = 1
print(reduced.columns.values)
print(df_xls.columns.values)
# NOTE(review): iterating a DataFrame yields column labels, so csv_row is
# e.g. 'ticker' — reduced.ticker['ticker'] then raises KeyError. The fix
# posted below iterates reduced.index / df_xls.index instead.
for csv_row in reduced:
for xls_row in df_xls:
if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:
wtw = 0
break
else:
# next() on a string label raises TypeError; for-loops advance themselves
next(xls_row)
if wtw == 1:
df_xls = df_xls.append(pd.DataFrame({"ticker": [[reduced.ticker[csv_row]]]}), ignore_index=True)
next(csv_row)
print (df_xls)
This resolved the KeyError:
##load excel
from pandas import ExcelWriter
from pandas import ExcelFile
df_xls = pd.read_excel('file.xlsx')

##update ticker list saved in reduced##
# Iterate row labels via .index — iterating the DataFrame itself yields
# column labels, which caused the original KeyError.
for csv_row in reduced.index:
    wtw = 1  # "write to worksheet" flag: 1 = ticker not seen yet
    print ("testing " + reduced.ticker[csv_row])
    for xls_row in df_xls.index:
        print ("comparing "+ reduced.ticker[csv_row] + "with ")
        print (df_xls.ticker[xls_row])
        if reduced.ticker[csv_row] == df_xls.ticker[xls_row]:
            print ("match found")
            wtw = 0
            break
    if wtw == 1:
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported equivalent and produces the same frame.
        df_xls = pd.concat(
            [df_xls, pd.DataFrame({"ticker": [[reduced.ticker[csv_row]]]})],
            ignore_index=True,
        )
print (df_xls)

Pandas not updating CSV

Dataset:
https://github.com/Bene939/newsheadlinedatasets
With my program I am labeling my dataset of news headlines. It worked fine until today.
For some reason it won't write the csv file anymore. As far as I can see the data frame gets updated though.
At around 4469 rows of my csv it started failing to overwrite the csv file intermittently, and at row 4474 it stopped overwriting completely. It worked fine until now, and if I create a new csv it will overwrite that one.
I am using Jupyter Notebook. Is there some kind of limit to this? The labeled dataset is around 300KB.
!pip install pandas
!pip install pathlib
import pandas as pd
from pathlib import Path
#takes data frame and file name & appends it to given csv
def append_df(df, file_name):
    """Append `df` to the CSV at `file_name`, dropping duplicate rows.

    If the file does not exist yet it is created from `df` alone. Output
    is written without the index, with a header, in UTF-8 with BOM.
    """
    my_file = Path(file_name)
    if my_file.exists():
        print("Appending to existing file named " + file_name)
        orig_df = pd.read_csv(file_name)
        print("Old Data Frame: ")
        print(orig_df)
        # drop_duplicates: rows already present in the file are not added twice
        new_df = pd.concat([orig_df, df], ignore_index=True).drop_duplicates()
        print("New Data Frame: ")
        print(new_df)
        new_df.to_csv(file_name, index=False, header = True, encoding='utf-8-sig')
    else:
        print("Creating new file named " + file_name)
        # Bug fix: write the `df` argument, not the global news_sentiment_df
        # the original referenced (they only happened to be the same object
        # in this script). Also added the missing space in the message.
        df.to_csv(file_name, index=False, header = True, encoding='utf-8-sig')
#takes data frame and file name & overwrites given csv
def update_csv(df, file_name):
    """Overwrite `file_name` with `df` (no index, header row, UTF-8 BOM)."""
    print("Overwriting " + file_name)
    df.to_csv(file_name, header = True, index=False, encoding='utf-8-sig')
#shows sentence by sentence, labels it according to input and saves it in a new csv file
print("WARNING: EDITING CSV FILE WITH EXCEL MAY CORRUPT FILE\n")
file_name = "news_headlines.csv"
new_file = "news_headlines_sentiment.csv"
news_sentiment_df = pd.DataFrame(columns=["news", "sentiment"])
my_file = Path(file_name)
if my_file.exists():
    # NOTE(review): error_bad_lines was removed in pandas 2.0; the modern
    # spelling is on_bad_lines='skip' — keep as-is for the pandas version
    # this script targets (it also still uses DataFrame-era idioms).
    df = pd.read_csv(file_name, encoding='utf-8-sig', error_bad_lines=False)
    print("Loaded " + file_name)
    for index, row in df.iterrows():
        user_input = -1
        valid_inputs = [0, 1, 2]  # renamed: `range` shadowed the builtin
        while user_input not in valid_inputs:
            print("####################################################################")
            print(row["news"])
            try:
                user_input = int(input("Negative: 0\nNeutral: 1\nPositive: 2\n"))
            except ValueError:
                print("\nPlease enter an Integer!\n")
        #label sentiment according to input — user_input is 0, 1 or 2 here,
        #so the original if/elif chain reduces to using it directly
        new_element = [row["news"], user_input]
        #save labeled sentence to new file
        news_sentiment_df.loc[len(news_sentiment_df)] = new_element
        append_df(news_sentiment_df, new_file)
        #delete data point from original data frame
        index_name = df[df["news"] == row["news"]].index
        df.drop(index_name, inplace=True)
        #update old csv file (rewritten after every label, by design)
        update_csv(df, file_name)
else:
    print("File not Found")
It turned out I was re-adding rows that were already present, and the drop_duplicates call silently removed them — so the CSV only looked like it wasn't being updated.

Split a dataframe into multiple data set based on condition and each sub set into Excel

Can someone please help me here? I do not get any output, and I do not get an error message either. I am trying to split a dataframe into multiple subsets using custom conditions and paste each subset into a separate Excel worksheet.
Master_data(df) Output A Output B
import pandas as pd
import os
## Belgium\2020\GMC Prep Automation")
from openpyxl import load_workbook
import xlsxwriter
from shutil import copyfile

file = input("please enter excelfile: ")
extension = os.path.splitext(file)[1]
filename = os.path.splitext(file)[0]
pth = "\\we.interbrew.net\\DFSEurope\\Crown Jewels\\Revenue Management\\WEST\\2. BE\\4. MPM Belgium\\2020\\GMC Prep Automation"
newfile = os.path.join(pth, filename+"_2"+extension)
#myfile = os.path.join(pth, Split_Test.xlsx)
df = pd.read_excel(file)
colpick = input("enter column to be splitted: ")
# distinct values of the chosen column: one output sheet per value
col = list(set(df[colpick].values))

def sendtoexcel(col):
    """Write one sheet per distinct value of `colpick` into `newfile`."""
    copyfile(file, newfile)
    # Bug fix: the original wrapped everything in a second `for j in col:`
    # loop, recreating the writer and rewriting every sheet len(col) times.
    writer = pd.ExcelWriter(newfile, engine='openpyxl')
    for myname in col:
        mydf = df.loc[df[colpick] == myname]
        mydf.to_excel(writer, sheet_name=myname, index=False)
    writer.save()
    print("\nCompleted")
    return
Assuming user inputs correct file names and existing column, consider groupby run and not a double for loop on the same column. Code is wrapped in try/except in case user enters an incorrect column name or some issue with exporting data fame to Excel.
from openpyxl import load_workbook
...
colpick = input("enter column to be splitted: ")
colpick = colpick.title().strip()

def sendtoexcel():
    """Append one sheet per `colpick` group of `df` to the workbook `file`."""
    try:
        # mode='a' appends sheets to the existing workbook; assigning
        # writer.book = load_workbook(file) is unsupported in pandas >= 1.5.
        with pd.ExcelWriter(file, engine='openpyxl', mode='a') as writer:
            # groupby on the plain column name keeps `i` a scalar usable as
            # a sheet name (a one-element list key makes it a 1-tuple in
            # pandas >= 2.0)
            for i, sub in df.groupby(colpick):
                sub.to_excel(writer, sheet_name=i, index=False)
            # no explicit writer.save(): the with-block saves on close, and
            # ExcelWriter.save was removed in pandas 2.0
    except Exception as e:
        print(e)

# ACTUALLY RUN FUNCTION
sendtoexcel()

updating excel using python

I have a workbook with multiple sheets in it. I am trying to read one sheet data and match to column fields in other sheet to see if they match update some column for that sheet. This is what I was trying. But as I understand XLRD can't be used to write. Can anyone point me to python library or module which can do both read and write at sam time:
`#!/usr/bin/python
# NOTE(review): Python 2 code (print statements). xlrd workbooks/sheets are
# read-only — xlrd's Sheet has no `append` method, so the last line will
# raise AttributeError; the xlwt `workbook1` is created but never used.
# A library that reads AND writes .xlsx (e.g. openpyxl) is needed here.
import xlrd, xlwt
workbook = xlrd.open_workbook('nagios.xlsx')
# created but never written to — presumably intended as the output workbook
workbook1 = xlwt.Workbook()
worksheet1 = workbook.sheet_by_name('contacts_users')
worksheet2 = workbook.sheet_by_name('contact_group_nagios')
# for each group row: column 0 = group name, column 1 = comma-separated members
for row in range(1, worksheet2.nrows):
print "value: ", worksheet2.cell(row,0).value
print "value: ", worksheet2.cell(row,1).value
s = worksheet2.cell(row,1).value
grp_name = worksheet2.cell(row,0).value
members = s.split(",")
# look up each member in the contacts sheet and extend its 4th column
for member in members:
for row1 in range(1, worksheet1.nrows):
if member == worksheet1.cell(row1,0).value:
s1 = worksheet1.cell(row1,3).value
s1 += grp_name
# fails: xlrd sheets have no append/write methods
worksheet1.append(row1,3, s1)`

Categories

Resources