Pandas and Tkinter , opening and saving files - python

I am trying to save the excel file as a .xlsx file then I want to import that new file back into python using pandas and numpy to allow for statistical analysis.
I want the USER to have the option of where to save the file with the 'asksaveasfile' and another dialog box 'askopenwhen opening the file for pandas and numpy.
Also, if someone can show how to convert specific columns from bytes to str. See below...
TypeError: write() argument must be str, not bytes.
Here's the end of the code:
import csv
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog
fn = filedialog.asksaveasfile(mode='w', defaultextension=".xlsx")
result = pdf_to_csv(fn)
lines = result.split('\n')
import openpyxl as pxl
wb = pxl.Workbook(fn)
ws = wb.active
for line in lines:
ws.append(line.split(';'))
# appending a list gives a complete row in xlsx
print("Successfully Saved! ")
root = tk.Tk()
root.withdraw()
dataFile=pd.read_excel(fn)#,usecols=['Last Name','First Name','Assignment Title','Department Code','Calendar Year', 'Compensation'])
dataFile.to_excel(fn)
print(fn)
df = fn
DataFrame = df
df1 = DataFrame
df1.columns = ['Last Name','First Name','Assignment Title','Department Code','Calendar Year', 'Compensation']
df1.drop(df1.index[0], inplace=True)
print(df1.head(11))

Related

Print only the rows which are Modified/Added/Removed in the excel sheet

I am trying to find the delta i.e. difference between the two json files
so, basically I am printing the new version of json file and old version of json files in the excel sheet creating two separate worksheet ,then comparing those two sheets to find the difference and storing the difference in another worksheet in the same excel sheet. But instead of only printing the row which are modified all the files are getting printed. Can somebody help me with this ?
Thanks in advance!
Code:-
import pandas as pd
from pathlib2 import Path
from xlsxwriter import *
from tkinter.font import BOLD
import ruamel.yaml
import os,json
import numpy as np
from tkinter import *
from tkinter import messagebox
from tkinter import ttk
from tkinter import filedialog
root=Tk()
root.resizable(False,False)
mywidth=550
myheight=460
selectfile=Label(root,text="File1")
selectfile1=Label(root,text="File2")
selectfile2=Label(root,text="save file")
selectfile.grid(row=1,column=1)
selectfile1.grid(row=2,column=1)
selectfile2.grid(row=3,column=1)
selectfileentry=Entry(root,width=40)
selectfileentry1=Entry(root,width=40)
selectfileentry2=Entry(root,width=40)
selectfileentry.grid(row=1,column=2)
selectfileentry1.grid(row=2,column=2)
selectfileentry2.grid(row=3,column=2)
def open_file1():
global filepath,filelist
filepath=filedialog.askdirectory(title="select a directory")
filelist=[]
for root,dirs,files in os.walk(filepath):
for file in files:
if file.endswith(".json"):
filelist.append(os.path.join(root,file))
selectfileentry.insert(0,filepath)
def open_file2():
global filepath2,filelist2
filepath2=filedialog.askdirectory(title="select a directory")
filelist2=[]
for root,dirs,files in os.walk(filepath2):
for file in files:
if file.endswith(".json"):
filelist2.append(os.path.join(root,file))
selectfileentry1.insert(0,filepath2)
def buttontoexcel():
global fvc
fvc=filedialog.asksaveasfilename(defaultextension=".xlsx",filetypes=[("excel files","*.xlsx")])
selectfileentry2.insert(0,fvc)
def add2excel():
dfs1=[]
writer=pd.ExcelWriter(fvc,engine="xlsxwriter")
for file in filelist:
with open(file) as f:
ab=json.load(f)
gg=ab["Indicator"]
json_data=pd.json_normalize(gg)
dfs1.append(json_data)
df=pd.concat(dfs1)
df.to_excel(writer,sheet_name="old",index=False)
dfs2=[]
for file in filelist2:
with open(file) as f:
ab=json.load(f)
gg=ab["Indicator"]
json_data=pd.json_normalize(gg)
dfs2.append(json_data)
df=pd.concat(dfs1)
df.to_excel(writer,sheet_name="new",index=False)
writer.save()
def excel_dif():
df_old=pd.read_excel(fvc,sheet_name=0,keep_default_na=False)
df_new=pd.read_excel(fvc,sheet_name=1,keep_default_na=False)
x=list((set(df_new.columns)|set(df_old.columns))-(set(df_old.columns)&set(df_new.columns)))
for g in x:
df_old[g]=''
key=['Name','NeType','Category']
df_old=df_old.set_index(key)
df_new=df_new.set_index(key)
dfDiff=df_new.copy()
droppedRows=[]
newRows=[]
modifiedRows=[]
cols_old=df_old.columns
cols_new=df_new.columns
sharedcols=list(set(cols_old).intersection(cols_new))
dfDiff.insert(0,"comment",'')
for row in dfDiff.index:
if(row in df_old.index)and(row in df_new.index):
for col in sharedcols:
value_old=df_old.loc[row,col]
value_new=df_new.loc[row,col]
if value_old==value_new:
df_new.loc[row,col]=df_new.loc[row,col]
else:
dfDiff.loc[row,col]=(value_new)
dfDiff.loc[row,'comment']="Modified"
modifiedRows.append(row)
else:
dfDiff.loc[row,'comment']='Added'
newRows.append(row)
for row in df_old.index:
if row not in df_new.index:
droppedRows.append(row)
dfDiff=dfDiff.append(df_old[row,:])
dfDiff.loc[row,'comment']='removed'
writer2=pd.ExcelWriter(fvc,engine='xlsxwriter')
df_old.to_excel(writer2,sheet_name="old",index=True)
df_new.to_excel(writer2,sheet_name="new",index=True)
dfDiff.to_excel(writer2,sheet_name="delta",index=True)
writer2.save()
ttk.Button(text="Browse",command=open_file1).grid(row=1,column=3)
ttk.Button(text="Browse",command=open_file2).grid(row=2,column=3)
ttk.Button(text="Browse",command=buttontoexcel).grid(row=3,column=3)
ttk.Button(text="Browse",command=lambda:[add2excel(),excel_dif()]).grid(row=4,column=3)
root.mainloop()

How can I combined multiple Excel files with different headers into one sheet using Pandas?

I have to combine several files (roughly 40), that have several variations of headers (ie, headers in different columns depending on the file, some files with a few column names that don't show up in others, etc.).
I have a python script that works to combined the files, but it simply puts them in the same order found in the original file. I want this script to be able to add a new column when a new column name shows up, and map all future occurrences of that column name with the respective row.
An example of my desired output is below, where the 'Gross Commission' and 'Payouts' columns only show up in the 2019 July file, and the '%' and '$' columns only show up in the 2018 June file (and all of the other columns showing up in both):
*Each file is for a different 'Period'.
Current Code:
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
in_path = filedialog.askdirectory()
listing = os.listdir(in_path)
files_xlsx = [f for f in listing if f[-4:] == 'xlsx']
df = pd.DataFrame()
for infile in listing:
file_data = pd.read_excel(in_path + '/' + infile,
header=0,
encoding = "ANSI")
df = df.append(file_data, sort=False)
out_path = in_path + ' Combined.xlsx'
writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
df.to_excel(writer,
sheet_name='Combined',
index=False,
header=None)
writer.save()
Thank you for the help, and let me know if I can provide any more detail.
To get the headers to write to your xlsx file you can change header=True when you all df.to_excel().
The modified code is:
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
in_path = filedialog.askdirectory()
listing = os.listdir(in_path)
files_xlsx = [f for f in listing if f[-4:] == 'xlsx']
df = pd.DataFrame()
for infile in listing:
file_data = pd.read_excel(in_path + '/' + infile,
header=0,
encoding = "ANSI")
df = df.append(file_data, sort=False)
out_path = in_path + ' Combined.xlsx'
writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
df.to_excel(writer,
sheet_name='Combined',
index=False,
header=True)
writer.save()
and gives me an output like this with some test data

How can i achieve a vlookup excel like functionality in Python

In Billing Roster - SOW.xlsx I have new column data one is named as SOW and other is named SOW Description (Match value for SOW).
And now when i open ACFC_Resource_Allocation.xlsx excel and for an example if select a value in D2 (SOW) cell from the dropdown i should get a matching value into E2 cell after the selection from dropdown.
I only have an idea than a vlookup from Excel like below should solve my case. Not sure how to achieve in python.
=VLOOKUP(D2,'[Billing Roster - SOW.xlsx]SOW List'!$A$1:$B$14,1,FALSE)
Tried below code
from openpyxl import *
from openpyxl.styles import *
import webbrowser
import pandas
from openpyxl.worksheet.datavalidation import DataValidation
# Read all Excels into pandas dataframes
sowexcel = pandas.read_excel('Billing Roster - SOW.xlsx')
#Load the existing Resource Allocation Excel
wb = load_workbook('ACFC_Resource_Allocation.xlsx')
allocationsheet = wb.active
def load():
maxrow = allocationsheet.max_row
sow_list = sowexcel['SOW #'].tolist()
column_sow = ','.join(sow_list)
validator_sow = DataValidation(type='list', formula1='"{}"'.format(column_sow), allow_blank=True)
allocationsheet.add_data_validation(validator_sow)
validator_sow.add('D2:D%s' %maxrow)
# save the file
wb.save('ACFC_Resource_Allocation.xlsx')
wb.close()
# Driver code
if __name__ == "__main__":
load()
file_open = webbrowser.open('ACFC_Resource_Allocation.xlsx')

Why am i only picking the first cell out of an excel sheet | Python 3

The point of the program is to feed a list of urls and get emails from the whois data
import whois
import xlrd
import tkinter
from tkinter import filedialog
from tkinter import *
import pandas as pd
#excel sheet pull
root = Tk()
root.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("xls files","*.xls"),("all files","*.*")))
workbookName = root.filename
workbook = xlrd.open_workbook(workbookName)
sheet = workbook.sheet_by_index(0)
'''
for site in sheet.cell:
whoisdataset = whois.whois(site)
for key in whoisdataset:
if "emails" in key:
enter code here`if "abuse" not in whoisdataset[key]:
print (key, whoisdataset[key])
'''
#sheet as matrix
df = pd.read_excel(workbookName)
df.as_matrix()
#Whois email pull
#if you replace "df" with a matrix, it will iterate and work
for website in df:
whoisdata = whois.whois(website)
for line in whoisdata:
if "emails" in line:
if "abuse" not in whoisdata[line]:
print (website, ":", line, whoisdata[line])
the #excel sheet pull works, and #whois email pull works if i feed it a 1 dimensional matrix, but #sheet as matrix is only giving me the first cell to work with, does anyone know how to make it give me the whole first column as a matrix?

read encrypted excel with pandas

I am scanning data through encrypted excel files with python. I would like to read the file content without opening excel. Here is my code. I usually use pandas to read files but pandas.read_excel do not allows to add password.
from xlrd import *
import win32com.client
import csv
import sys
xlApp = win32com.client.Dispatch("Excel.Application")
xlwb = xlApp.Workbooks.Open(path1+file_name, Password='password')
Thank you
Check and upvote if below lines help......
from xlrd import *
import win32com.client
import csv
import sys
import pandas as pd
from tempfile import NamedTemporaryFile
xlApp = win32com.client.Dispatch("Excel.Application")
filename,password = r'fullpath','password'
# Note this line from the question posted
xlwb = xlApp.Workbooks.Open(filename, False, True, None, password)
xlws = xlwb.Sheets(1) # index is from 1
print (xlws.Name)
print (xlws.Cells(1, 1)) # if you need cell values
f = NamedTemporaryFile(delete=False, suffix='.csv')
f.close()
os.unlink(f.name)
xlCSVWindows = 0x17 # CSV file format, from enum XlFileFormat
xlwb.SaveAs(Filename=f.name, FileFormat=xlCSVWindows) # Save as CSV
df = pd.read_csv(f.name)
print(df.head())
df.to_csv('myoutput.csv',index=False)

Categories

Resources