I used pyinstaller to pack the following script in an exe file with pyinstaller -F.
For clarity the script concat csv files in one file and export them to a new csv file.
# import necessary libraries
import pandas as pd
import os
import glob
from datetime import datetime
#Set filename
file_name = 'daglig_LCR_RLI'
# in the folder
path = os.path.dirname(os.path.abspath(__file__))
# Delete CSV file
# first check whether file exists or not
# calling remove method to delete the csv file
# in remove method you need to pass file name and type
del_file = path+"\\" + file_name +'.csv'
## If file exists, delete it ##
if os.path.isfile(del_file):
os.remove(del_file)
print("File deleted")
else: ## Show an error ##
print("File not found: " +del_file)
# use glob to get all the csv files
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_list= list()
#format columns
dict_conv={'line_item': lambda x: str(x),
'column_item': lambda x: str(x)}
# loop over the list of csv files
for f in csv_files:
# read the csv file
df = pd.read_csv(f, sep=";", converters = dict_conv, encoding='latin1') #test latin1
df_list.append(df)
#print the location and filename
print('Location:', f)
print('File Name:', f.split("\\")[-1])
#add data frames to a list
RLI_combined = pd.concat(df_list, axis=0)
#Write date to approval_text
now = datetime.now()
# dd/mm/YY
print_date = now.strftime("%d/%m/%Y")
RLI_combined.loc[:, 'approval_text'] = print_date
#replace value_text with n/a
RLI_combined.loc[:, 'value_text'] = "n/a"
#Sum columns
m = RLI_combined['column_item'].isin(['0030', '0050', '0080'])
RLI_combined_sum = RLI_combined[~m].copy()
RLI_combined_sum['amount'] = RLI_combined_sum.groupby(['report_name', 'line_item', 'column_item'])['amount'].transform('sum')
RLI_combined_sum = RLI_combined_sum.drop_duplicates(['report_name', 'line_item', 'column_item'])
RLI_combined = pd.concat([RLI_combined_sum, RLI_combined[m]])
#export to csv
RLI_combined.to_csv(path + "//" + file_name + '.csv', index=False, sep=";", encoding='latin1')
#Make log
# Create the directory
directory = "Log"
parent_dir = path
# Path
path_log = os.path.join(parent_dir, directory)
try:
os.mkdir(path_log)
print('Log folder dannet')
except OSError as error:
print('Log folder eksisterer')
#export to csv
log_name = now.strftime("%d-%m-%Y_%H-%M-%S")
print(log_name)
RLI_combined.to_csv(path + "//" + 'Log' +"//" + file_name+'_' + log_name + '.csv', index=False, sep=";", encoding='latin1')
Everything works as intended when don't use pyinstaller. If I run the exe file after 10 sec of nothing it writes the following:
What am I doing wrong that causes the error? and could I improve performance so the exe file isn't that slow.
I hope you can point me in the right direction.
I believe I found the solution to the problem. I use anaconda and pyinstaller uses all the installed libaries on the machine.
So using a clean install of python and only installling the nescecary libaries might fix the problem.
The error seems to be a numpy error and the script isn't using that libary.
Related
I have a script that takes a few csv files and concat them, it works as intended when running it in the python shell, but it fails when I make an one file exe with pyinstaller.
This is the error I get when I run my script:
The part that seems to fail is this part:
# use glob to get all the csv files
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_list= list()
#format columns
dict_conv={'line_item': lambda x: str(x),
'column_item': lambda x: str(x)}
# loop over the list of csv files
for f in csv_files:
# read the csv file
df = pd.read_csv(f, sep=";", converters = dict_conv, encoding='latin1') #test latin1
df_list.append(df)
#print the location and filename
print('Location:', f)
print('File Name:', f.split("\\")[-1])
#add data frames to a list
RLI_combined = pd.concat(df_list, axis=0)
This is my whole for context script:
# import necessary libraries
import pandas as pd
import os
import glob
from datetime import datetime
#Set filename
file_name = 'daglig_LCR_RLI'
# in the folder
path = os.path.dirname(os.path.abspath(__file__))
# Delete CSV file
# first check whether file exists or not
# calling remove method to delete the csv file
# in remove method you need to pass file name and type
del_file = path+"\\" + file_name +'.csv'
## If file exists, delete it ##
if os.path.isfile(del_file):
os.remove(del_file)
print("File deleted")
else: ## Show an error ##
print("File not found: " +del_file)
# use glob to get all the csv files
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_list= list()
#format columns
dict_conv={'line_item': lambda x: str(x),
'column_item': lambda x: str(x)}
# loop over the list of csv files
for f in csv_files:
# read the csv file
df = pd.read_csv(f, sep=";", converters = dict_conv, encoding='latin1') #test latin1
df_list.append(df)
#print the location and filename
print('Location:', f)
print('File Name:', f.split("\\")[-1])
#add data frames to a list
RLI_combined = pd.concat(df_list, axis=0)
#Write date to approval_text
now = datetime.now()
# dd/mm/YY
print_date = now.strftime("%d/%m/%Y")
RLI_combined.loc[:, 'approval_text'] = print_date
#replace value_text with n/a
RLI_combined.loc[:, 'value_text'] = "n/a"
#Sum columns
m = RLI_combined['column_item'].isin(['0030', '0050', '0080'])
RLI_combined_sum = RLI_combined[~m].copy()
RLI_combined_sum['amount'] = RLI_combined_sum.groupby(['report_name', 'line_item', 'column_item'])['amount'].transform('sum')
RLI_combined_sum = RLI_combined_sum.drop_duplicates(['report_name', 'line_item', 'column_item'])
RLI_combined = pd.concat([RLI_combined_sum, RLI_combined[m]])
#export to csv
RLI_combined.to_csv(path + "//" + file_name + '.csv', index=False, sep=";", encoding='latin1')
#Make log
# Create the directory
directory = "Log"
parent_dir = path
# Path
path_log = os.path.join(parent_dir, directory)
try:
os.mkdir(path_log)
print('Log folder dannet')
except OSError as error:
print('Log folder eksisterer')
#export to csv
log_name = now.strftime("%d-%m-%Y_%H-%M-%S")
print(log_name)
RLI_combined.to_csv(path + "//" + 'Log' +"//" + file_name+'_' + log_name + '.csv', index=False, sep=";", encoding='latin1')
I hope you can point me in the right direction.
with the pyinstaller one file executable you will often ran into problems like that. When starting the *.exe it is extracted to a temporary directory and this is for example the start-location for relative path definitions.
So even if you get your script running and export your *.csv it will often be somewhere on your HD and not at the place of the *.exe where you perhaps expect it to be.
I think in your case the variable df_list stays empty because there are no files listed in csv_files. This is because in the temp dir (location is written in the top of the output) there are no *.csv files.
Please try printing the content of csv_files when running the one-file *.exe if this is the right guess
If this is the case start by running a one-dir *.exe and if this works you know that you have a problem with your path definitions
I have a folder with 50 .csv files. The .csv files are auto-generated and a results/ output from a process-based model (long and automatically named). For example, sandbox_username_vetch_scaleup_IA_1.csv; sandbox_username_vetch_scaleup_IA_2.csv, and it continues till sandbox_username_vetch_scaleup_IA_50.csv.
I am trying to shorten the file names in a way so that the files are names are IA_1, IA_2 ...up to IA_50 and subsequently the new .csv file name gets added as a column to the data frame. Here is what I have tried so far
# import necessary libraries
import pandas as pd
import os
import glob
import sys
from pathlib import Path
import re
data_p = "/Users/Username/Documents/HV_Scale/CWAD"
output_p = "/Users/Username/Documents/HV_Scale/CWAD"
retval = os.getcwd()
print (retval) # see in which folder you are
os.chdir(data_p) # move to the folder with your data
os.getcwd()
filenames = sorted(glob.glob('*.csv'))
fnames = list(filenames) # get the names of all your files
#print(fnames)
#Loop over
for f in range(len(fnames)):
print(f'fname: {fnames[f]}\n')
pfile = pd.read_csv(fnames[f], delimiter=",") # read in file
#extract filename
filename = fnames[f]
parts = filename.split(".") # giving you the number in file name and .csv
only_id = parts[0].split("_") # if there is a bracket included
# get IA from your file
filestate = pfile["IA"][0] # assuming this is on the first row
filestate = str(filestate)
# get new filename
newfilename = only_id[0]+"-"+filestate+parts[1]
# save your file (don't put a slash at the end of your directories on top)
pfile.to_csv(output_p+"/"+newfilename, index = False, header = True)
Here is the code for adding the csv file name as a column
import glob
import os
import shutil
import sys
import pandas as pd
path = '/Users/Username/Documents/HV_Scale/IA_CWAD/short'
all_files = glob.glob(os.path.join(path, "*.csv"))
names = [os.path.basename(x) for x in glob.glob(path+'\*.csv')]
df = pd.DataFrame()
for file_ in all_files:
file_df = pd.read_csv(file_,sep=';', parse_dates=[0], infer_datetime_format=True,header=None )
file_df['file_name'] = file_
df = df.append(file_df)
#However, this adds the old csv file name and not the renamed one
In order to rename and move these files, all you need is:
import glob
import os
import shutil
import sys
SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'
for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
idx = file.index('_IA_')
filename = file[idx+1:]
target = os.path.join(TARGET, filename)
if os.path.exists(target):
print(f'Target file {target} already exists', file=sys.stderr)
else:
shutil.copy(file, target)
As there's nothing in the OP's question that tries to handle modification of the CSV files, that is left as an exercise for the OP.
Source and target directories should be different otherwise this can lead to ambiguous results
I try to convert a .xlsx file into .csv file whenever a new file is added into the Inputfolder and put the conversion .csv file in the OutputFolder.
import glob
import time
import os
import pandas as pd
#Get timestamp
timestr = time.strftime("%Y%m%d_%H%M%S")
#Input file path
input_filepath = 'C:/Documents/InputFile'
folderSize = 0
#Function to convert file
def format_csv(latest_file):
#Output file path
filenamepath = 'C:/Documents/OutputFile/' + timestr + '.csv'
read_Excelfile = pd.read_excel(latest_file)
read_Excelfile.to_csv(filenamepath, index=None, header=True)
while True:
checkFolder = folderSize
folderSize = 0
#Check the size of the Input Folder
for path, dirs, files in os.walk(input_filepath):
for f in files:
fp = os.path.join(path, f)
folderSize += os.path.getsize(fp)
print(folderSize)
#Create new .csv file if the Input folder has new file added
if(folderSize > checkFolder):
list_of_files = glob.glob('C:/Documents/InputFile/*.xlsx')
latest_file = max(list_of_files, key=os.path.getctime)
format_csv(latest_file)
print(latest_file)
time.sleep(15)
Right now the program will only convert the first .xlsx file only. If I add a new .xlsx file into InputFolder, the file is not converted.
You could try something like reading through the folder for all .xlsx files if it finds one convert that to .csv
Here we are reading through the directory for all xlsx files, converting them by creating copies in csv version and then deleting the original xlsx version
import pandas as pd
import os
path = 'C:/Documents/InputFile'
files = os.listdir(path)
for file in files:
if '.xlsx' in file:
filename = file[:-5]
new_filename = path + "/" + filename + ".csv"
if filename + ".csv" in files:
pass
else:
df = pd.read_excel(file)
df.to_csv(new_filename)
I already improvise my original code. So, whenever I put a new excel file into InputFolder, the program will convert the file to .csv format and insert the formatted file in OutputFolder
import glob
import time
import os
import pandas as pd
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
#Function if new file is created in the folder
def on_created(event):
list_of_files = glob.glob('C:/Users/Documents/InputFolder/*.xlsx')
latest_file = max(list_of_files, key=os.path.getctime)
format_csv(latest_file)
#Function to convert .xlsx to .csv
def format_csv(latest_file):
# Get timestamp
timestr = time.strftime("%d%m%Y_%H%M%S")
#Output file path
filenamepath = 'C:/Users/Documents/OutputFolder/' + timestr + '.csv'
read_Excelfile = pd.read_excel(latest_file)
read_Excelfile.to_csv(filenamepath, index=None, header=True)
print(filenamepath)
if __name__ == "__main__":
event_handler = FileSystemEventHandler()
#Calling function for file insertion
event_handler.on_created = on_created
#Input Folder
path = 'C:/Users/Documents/InputFolder'
#Function to observe file
observer = Observer()
observer.schedule(event_handler, path, recursive=True)
observer.start()
try:
#Check every one second
while True:
time.sleep(1)
except KeyboardInterrupt:
#Program stop if keyboard interupt
observer.stop()
observer.join()
I am new with Python but trying to write a code which add a column on multiple .xlsx files and saves this files with the origin name to a new folder.
I have started with some coding beneath, but missing some code in open all files and saving to my DestPath. Would be pleased if any has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
SourcePath = 'C:\' #Source Path
DestPath = 'C:\' #Destination Path
# Listing up all .xlsx files from Source
def find_xlsx_filenames( path_to_dir, suffix=".xlsx" ):
filenames = listdir(path_to_dir)
return [ filename for filename in filenames if filename.endswith( suffix ) ]
filenames = find_xlsx_filenames(SourcePath)
fname = path.join(SourcePath, filenames[0]) # Tar første fil i mappa.
outname = path.join(outputdata, filenames[0])
for i in range(len(filenames)):
fname = path.join(SourcePath, filenames[i])
df = pd.read_excel(fname) #Read Excel file as a DataFrame
df['new_col'] = 'Sort Data' #Adding a new column named <Sort Data>
#To save it back as Excel
df.to_excel(DestPath, outname) #Write DateFrame back as Excel file
Thanks in Advance
check if this works
import os
import pandas as pd
path = 'C:/'
for roots, dirs, files in os.walk(path):
xlsfile = [ _ for _ in files if _.endswith('.xlsx')]
for xlsf in xlsfile:
df = pd.read_excel(os.path.join(roots, xlsf))
df['Sort Data'] = ' '
df.to_excel(os.path.join(roots, xlsf), index = False)
I have multiple directories, each of which containing any number of .xls files.
I'd like to take the files in any given directory and combine them into one .xls file, using the file names as the tab names.
For example if there are the files NAME.xls, AGE.xls, LOCATION.xls, I'd like to combine them into a new file with the data from NAME.xls on a tab called NAME, the data from AGE.xls on a tab called AGE and so on.
Each source .xls file only has one column of data with no headers.
This is what I have so far, and well it's not working.
Any help would be greatly appreciated (I'm fairly new to Python and I've never had to do anything like this before).
wkbk = xlwt.Workbook()
xlsfiles = glob.glob(os.path.join(path, "*.xls"))
onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
tabNames = []
for OF in onlyfiles:
if str(OF)[-4:] == ".xls":
sheetName = str(OF)[:-4]
tabNames.append(sheetName)
else:
pass
for TN in tabNames:
outsheet = wkbk.add_sheet(str(TN))
data = pd.read_excel(path + "\\" + TN + ".xls", sheet_name="data")
data.to_excel(path + "\\" + "Combined" + ".xls", sheet_name = str(TN))
Here is a small helper function - it supports both .xls and .xlsx files:
import pandas as pd
try:
from pathlib import Path
except ImportError: # Python 2
from pathlib2 import Path
def merge_excel_files(dir_name, out_filename='result.xlsx', **kwargs):
p = Path(dir_name)
with pd.ExcelWriter(out_filename) as xls:
_ = [pd.read_excel(f, header=None, **kwargs)
.to_excel(xls, sheet_name=f.stem, index=False, header=None)
for f in p.glob('*.xls*')]
Usage:
merge_excel_files(r'D:\temp\xls_directory', 'd:/temp/out.xls')
merge_excel_files(r'D:\temp\xlsx_directory', 'd:/temp/out.xlsx')
Can you try
import pandas as pd
import glob
path = 'YourPath\ToYour\Files\\' # Note the \\ at the end
# Create a list with only .xls files
list_xls = glob.glob1(path,"*.xls")
# Create a writer for pandas
writer = pd.ExcelWriter(path + "Combined.xls", engine = 'xlwt')
# Loop on all the files
for xls_file in list_xls:
# Read the xls file and the sheet named data
df_data = pd.read_excel(io = path + xls_file, sheet_name="data")
# Are the sheet containing data in all your xls file named "data" ?
# Write the data into a sheet named after the file
df_data.to_excel(writer, sheet_name = xls_file[:-4])
# Save and close your Combined.xls
writer.save()
writer.close()
Let me know if it works for you, I never tried engine = 'xlwt' as I don't work with .xls file but .xlsx